convert-falcon-hf-to-gguf: Support --vocab-only option, bail out if no tokenizer.json

KerfuffleV2 2023-08-29 04:21:01 -06:00
parent 8534197f14
commit 61911ca4db


--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -8,6 +8,7 @@ import struct
 import json
 import numpy as np
 import torch
+import argparse

 from typing import Any, List
 from pathlib import Path
@@ -47,17 +48,22 @@ def count_model_parts(dir_model: str) -> int:
     return num_parts

-if len(sys.argv) < 3:
-    print(f"Usage: python {sys.argv[0]} dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
     sys.exit(1)

-# output in the same directory as the model
-dir_model = sys.argv[1]
-last_dir = os.path.basename(os.path.normpath(dir_model))
-
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
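
Note that `ftype` keeps `default = 1` but is declared as a plain positional: argparse only applies a default to a positional argument when it is marked optional (e.g. with `nargs='?'`), so as written the argument is still required on the command line. A minimal standalone sketch of the distinction (not part of this commit):

import argparse

# With nargs='?' the positional becomes optional and the default is used;
# without it, argparse treats the positional as required and the default
# is never consulted.
parser = argparse.ArgumentParser()
parser.add_argument("ftype", type=int, choices=[0, 1], nargs="?", default=1)

print(parser.parse_args([]).ftype)     # -> 1 (default applies)
print(parser.parse_args(["0"]).ftype)  # -> 0 (explicit value wins)
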
@@ -65,25 +71,21 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
 # map from ftype to string
 ftype_str = ["f32", "f16"]

-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-
-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

-print("gguf: loading model "+last_dir)
+print("gguf: loading model "+dir_model.name)

-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
     hparams = json.load(f)

 if hparams["architectures"][0] != "RWForCausalLM":
     print("Model architecture not supported: " + hparams["architectures"][0])
-    sys.exit()
+    sys.exit(1)

 # get number of model parts
 num_parts = count_model_parts(dir_model)
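
Paths now go through pathlib throughout: `dir_model` is a `Path`, joins use the `/` operator, and `dir_model.name` replaces the old `last_dir` computed via `os.path.basename`. A short sketch of the default output-path branch above, assuming a hypothetical input directory `./falcon-7b` and `ftype == 1`:

from pathlib import Path

dir_model = Path("./falcon-7b")  # hypothetical input directory
ftype_str = ["f32", "f16"]

# Mirrors the fallback branch used when --outfile is not given.
fname_out = dir_model / f"ggml-model-{ftype_str[1]}.gguf"
print(fname_out)       # falcon-7b/ggml-model-f16.gguf
print(dir_model.name)  # falcon-7b
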
@@ -117,49 +119,53 @@ tokens: List[bytearray] = []
 scores: List[float] = []
 toktypes: List[int] = []

-if Path(dir_model + "/tokenizer.json").is_file():
-    # gpt2 tokenizer
-    gguf_writer.add_tokenizer_model("gpt2")
-
-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer_json = json.load(f)
-
-    print("gguf: get gpt2 tokenizer vocab")
-
-    vocab_size = len(tokenizer_json["model"]["vocab"])
-
-    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
-    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-    byte_encoder = bytes_to_unicode()
-    byte_decoder = {v: k for k, v in byte_encoder.items()}
-
-    for i in range(vocab_size):
-        if i in reverse_vocab:
-            try:
-                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-            except KeyError:
-                text = bytearray()
-                for c in reverse_vocab[i]:
-                    if ord(c) < 256:  # single byte character
-                        text.append(byte_decoder[ord(c)])
-                    else:  # multibyte special token character
-                        text.extend(c.encode('utf-8'))
-        else:
-            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-            pad_token = f"[PAD{i}]".encode("utf8")
-            text = bytearray(pad_token)
-
-        tokens.append(text)
-        scores.append(0.0)  # dymmy
-        toktypes.append(gguf.TokenType.NORMAL)  # dummy
-
-    gguf_writer.add_token_list(tokens)
-    gguf_writer.add_token_scores(scores)
-    gguf_writer.add_token_types(toktypes)
-
-    special_vocab = gguf.SpecialVocab(Path(dir_model))
-    special_vocab.add_to_gguf(gguf_writer)
+tokenizer_json_file = dir_model / 'tokenizer.json'
+if not tokenizer_json_file.is_file():
+    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
+    sys.exit(1)
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+    tokenizer_json = json.load(f)
+
+print("gguf: get gpt2 tokenizer vocab")
+
+vocab_size = len(tokenizer_json["model"]["vocab"])
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)  # dummy
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)

 # TENSORS
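
The vocab loop depends on `bytes_to_unicode`, which is defined earlier in this script and not shown in the diff. For reference, a sketch of the standard GPT-2 byte-to-unicode table it is assumed to match (following OpenAI's GPT-2 encoder; the `byte_decoder` above is simply its inverse):

def bytes_to_unicode() -> dict:
    # Printable single-byte characters map to themselves; the remaining
    # byte values are shifted to code points >= 256 so that every byte
    # gets a distinct, visible character.
    bs = (list(range(ord("!"), ord("~") + 1))
          + list(range(ord("\u00a1"), ord("\u00ac") + 1))
          + list(range(ord("\u00ae"), ord("\u00ff") + 1)))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))
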
@@ -183,8 +189,10 @@ else:
     )

 for part_name in part_names:
+    if args.vocab_only:
+        break
     print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")

     for name in model_part.keys():
         data = model_part[name]
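
The part loop iterates over shards whose count comes from `count_model_parts`, visible in an earlier hunk header and unchanged by this commit. A sketch of what that helper is expected to look like (it tallies the `pytorch_model-*.bin` checkpoint shards in the model directory):

import os

def count_model_parts(dir_model: str) -> int:
    # Count the pytorch_model-*.bin checkpoint shards in the directory.
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1
    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
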
@@ -244,10 +252,11 @@ print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
-print("gguf: write tensors")
-gguf_writer.write_tensors_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()

 gguf_writer.close()

-print("gguf: model successfully exported to '" + fname_out + "'")
+print(f"gguf: model successfully exported to '{fname_out}'")
 print("")