From a78c42d5dae86fe45096b32192eb27f00ee714cd Mon Sep 17 00:00:00 2001 From: aeslampanah Date: Sun, 9 Apr 2023 18:33:03 -0400 Subject: [PATCH 1/4] Added token conversion script to convert from tokenizer.json format to tokenizer.model format, tested with bigscience models --- tokenconvert.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 tokenconvert.py diff --git a/tokenconvert.py b/tokenconvert.py new file mode 100644 index 000000000..06b83affb --- /dev/null +++ b/tokenconvert.py @@ -0,0 +1,68 @@ +import json +import os +import tempfile +import sys +from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors + +if len(sys.argv) < 1: + print("Usage: python tokenconvert.py token-dir [dir-output]") + print(" token-dir: Directory of the model containing the tokenizer.json. Example: 'bigscience/bloomz-560m'") + print(" dir-output: directory where the output file will be written, eg: ./tokenizer.model , by default writes to the same directory.") + sys.exit(1) + +token_dir = sys.argv[1] + +if len(sys.argv) < 3: + dir_out = token_dir +else: + dir_out = sys.argv[2] + + +def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_config_path): + with open(json_path, "r", encoding="utf-8") as f: + json_data = json.load(f) + with open(special_tokens_map_path, "r", encoding="utf-8") as f: + special_tokens_map = json.load(f) + with open(tokenizer_config_path, "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + + model_data = json_data["model"] + vocab = model_data["vocab"] + merges = model_data["merges"] + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as vocab_file: + json.dump(vocab, vocab_file) + with tempfile.NamedTemporaryFile(mode="w", delete=False) as merges_file: + merges_file.write("\n".join(merges)) + + tokenizer = Tokenizer(models.BPE(vocab_file.name, merges_file.name)) + os.unlink(vocab_file.name) + os.unlink(merges_file.name) + + tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) + tokenizer.decoder = decoders.ByteLevel() + + bos_token_id = tokenizer.token_to_id(special_tokens_map["bos_token"]) + eos_token_id = tokenizer.token_to_id(special_tokens_map["eos_token"]) + + tokenizer.post_processor = processors.TemplateProcessing( + single=f"{special_tokens_map['bos_token']} $A {special_tokens_map['eos_token']}", + pair=f"{special_tokens_map['bos_token']} $A {special_tokens_map['eos_token']} {special_tokens_map['bos_token']} $B {special_tokens_map['eos_token']}", + special_tokens=[ + (special_tokens_map["bos_token"], bos_token_id), + (special_tokens_map["eos_token"], eos_token_id), + ], + ) + + return tokenizer + +if __name__ == "__main__": + input_json_path = os.path.join(token_dir, "tokenizer.json") + special_tokens_map_path = os.path.join(token_dir, "special_tokens_map.json") + tokenizer_config_path = os.path.join(token_dir, "tokenizer_config.json") + output_model_path = os.path.join(dir_out, "tokenizer.model") + + tokenizer = load_tokenizer_from_json(input_json_path, special_tokens_map_path, tokenizer_config_path) + print(f"Saving.. 
tokenizer.model to {output_model_path}") + tokenizer.save(output_model_path) + print(f"Saved tokenizer.model to {output_model_path}") From 74b92ff6b834a3e8d854e4e4986bdb845e382864 Mon Sep 17 00:00:00 2001 From: aeslampanah Date: Mon, 10 Apr 2023 17:21:33 -0400 Subject: [PATCH 2/4] Add helper script to convert hf (pytorch) models into ggml format --- convert-hf-to-ggml-v2.py | 181 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 convert-hf-to-ggml-v2.py diff --git a/convert-hf-to-ggml-v2.py b/convert-hf-to-ggml-v2.py new file mode 100644 index 000000000..86d4f64f2 --- /dev/null +++ b/convert-hf-to-ggml-v2.py @@ -0,0 +1,181 @@ +import io +import os +import sys +import struct +import json +import torch +import numpy as np +import tempfile +import argparse +from tqdm import tqdm + +from transformers import AutoTokenizer, AutoConfig + +conv_map = { + 'word_embeddings': 'tok_embeddings', + 'word_embeddings_layernorm': 'norm', + 'input_layernorm': 'attention_norm', + 'self_attention.query_key_value': 'attention.query_key_value', + 'self_attention.dense': 'attention.wo', + 'post_attention_layernorm': 'ffn_norm', + 'mlp.dense_h_to_4h': 'feed_forward.w1', + 'mlp.dense_4h_to_h': 'feed_forward.w2', + 'ln_f': 'output_norm', + 'lm_head': 'output', +} + +parser = argparse.ArgumentParser(description='Convert a model from HF format to GGML format.') +parser.add_argument('model_name', type=str, help='directory of the model to convert. Example: "bigscience/bloomz-560m"') +parser.add_argument('dir_output', type=str, help='directory where the output file will be written') +parser.add_argument('--use-f32', action='store_true', help='if present, use float32 instead of float16') +parser.add_argument('--debug', action='store_true', help='if present, dump the progress as it happens') +args = parser.parse_args() + +model_name = args.model_name +dir_out = args.dir_output + +os.makedirs(dir_out, exist_ok=True) + +ftype_str = ["f32", "f16"] +ftype = 0 if args.use_f32 else 1 +debug_flag = args.debug + +tokenizer = AutoTokenizer.from_pretrained(model_name) +config = AutoConfig.from_pretrained(model_name) +hparams = config.to_dict() +print("Loading model: ", model_name) + +# Save the model to disk +model_dir = f"{model_name}_tmp" +os.makedirs(model_dir, exist_ok=True) +config.save_pretrained(model_dir) + +fname_out = dir_out + f"/ggml-model-{model_name.split('/')[-1]}-{ftype_str[ftype]}.bin" +fout = open(fname_out, "wb") + +hparams["multiple_of"] = 1 +fout.write(struct.pack("i", 0x67676d6c)) +fout.write(struct.pack("i", hparams["vocab_size"])) +fout.write(struct.pack("i", hparams["hidden_size"])) +fout.write(struct.pack("i", hparams["multiple_of"])) +fout.write(struct.pack("i", hparams["n_head"])) +fout.write(struct.pack("i", hparams["n_layer"])) +fout.write(struct.pack("i", ftype)) + +dot_token = tokenizer.encode(".")[0] +for i in range(hparams["vocab_size"]): + text = tokenizer.decode([i]).encode('utf-8') + fout.write(struct.pack("i", len(text))) + fout.write(text) + +# Create temporary files for chunks +temp_files = {} + +# Define the chunk size +chunk_size = 1000 * 1000 * 1024 + +# Load the PyTorch model weights from the saved files +# Find the files in the model directory +model_files = sorted([f for f in os.listdir(model_name) if f.startswith("pytorch_model") and f.endswith(".bin")]) + +added_head = False +state_dict = {} +for model_file in tqdm(model_files, desc="Processing model files in: " + model_name): + file_path = os.path.join(model_name, model_file) + model_part = 
torch.load(file_path, map_location=torch.device('cpu')) + state_dict.update(model_part) + + # Add the missing lm_head.weight tensor + lm_head_weight_key = 'lm_head.weight' + word_embeddings_weight_key = 'word_embeddings.weight' + if lm_head_weight_key not in state_dict and not added_head: + # Use the word_embeddings.weight tensor for the lm_head.weight + word_embeddings_weight = state_dict[word_embeddings_weight_key] + + # Add the tensor to the state_dict + state_dict[lm_head_weight_key] = word_embeddings_weight + + added_head = True + + + for name in tqdm(state_dict.keys(), desc="Processing nodes"): + src = name + nn = name.split(".") + + # Handle layer indices + if nn[0].isdigit(): + layer_idx = nn[0] + nn = nn[1:] + else: + layer_idx = None + + if debug_flag: + if nn[0].isdigit(): + print("For Layer: " + layer_idx) + + if nn[0] == "h": + nn[0] = "layers" + mapped = conv_map[".".join(nn[2:-1])] + if layer_idx is not None: + name = f"{layer_idx}.{nn[0]}.{layer_idx}.{mapped}.{nn[-1]}" + else: + name = ".".join(nn[:2] + [mapped] + nn[-1:]) + else: + mapped = conv_map[".".join(nn[:-1])] + if layer_idx is not None: + name = f"{layer_idx}.{mapped}.{nn[-1]}" + else: + name = ".".join([mapped] + nn[-1:]) + + if "query_key_value" in src: + q, k, v = state_dict[src].reshape(config.n_head, 3, -1).unbind(1) + state_dict[src] = torch.cat([q, k, v], dim=0).reshape_as(state_dict[src]) + + if debug_flag: + print(src, ' -> ', name) + tensor = state_dict[src].cpu() + + # If the tensor dtype is bfloat16, convert it to float32 + if tensor.dtype == torch.bfloat16: + tensor = tensor.to(torch.float32) + + data = tensor.squeeze().numpy() + data = data.astype(np.float32) + + n_dims = len(data.shape) + + if debug_flag: + print(name, n_dims, data.shape) + + # Check if the current data type is float16 + if data.dtype == np.float16: + ftype_cur = 1 + else: + ftype_cur = 0 + + # If the specified ftype is float16 and the current ftype is not, convert data to float16 + if ftype == 1 and ftype_cur == 0 and n_dims > 1: + if debug_flag: + print("Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + + str = name.encode('utf-8') + fout.write(struct.pack("iii", n_dims, len(str), ftype_cur)) + for i in range(n_dims): + fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) + fout.write(str) + + # Write data to file in chunks + data_buffer = data.tobytes() + data_len = len(data_buffer) + for offset in range(0, data_len, chunk_size): + chunk = data_buffer[offset: offset + chunk_size] + fout.write(chunk) + + # Free some memory as we don't need the previous layer's state + state_dict = {} + +fout.close() +print("Done. 
Output file: " + fname_out) +print("") \ No newline at end of file From 7c8ee5aec561eb210aea25944e3f81a2a3a37ea4 Mon Sep 17 00:00:00 2001 From: aeslampanah Date: Thu, 13 Apr 2023 07:05:29 -0400 Subject: [PATCH 3/4] Updated tokenconvert.py script to add support for SentencePiece and WordPiece tokenizers, updated arguments --- tokenconvert.py | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) mode change 100644 => 100755 tokenconvert.py diff --git a/tokenconvert.py b/tokenconvert.py old mode 100644 new mode 100755 index 06b83affb..9d2dd6b68 --- a/tokenconvert.py +++ b/tokenconvert.py @@ -4,21 +4,23 @@ import tempfile import sys from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors -if len(sys.argv) < 1: - print("Usage: python tokenconvert.py token-dir [dir-output]") +if len(sys.argv) < 3: + print("Usage: python tokenconvert.py tokenizer_type token-dir [dir-output]") + print(" tokenizer_type: The type of tokenizer (check the model information), eg: BPE, WordPiece, SentencePiece.") print(" token-dir: Directory of the model containing the tokenizer.json. Example: 'bigscience/bloomz-560m'") print(" dir-output: directory where the output file will be written, eg: ./tokenizer.model , by default writes to the same directory.") sys.exit(1) -token_dir = sys.argv[1] +tokenizer_type = sys.argv[1] +token_dir = sys.argv[2] -if len(sys.argv) < 3: +if len(sys.argv) < 4: dir_out = token_dir else: - dir_out = sys.argv[2] + dir_out = sys.argv[3] -def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_config_path): +def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_config_path, tokenizer_type): with open(json_path, "r", encoding="utf-8") as f: json_data = json.load(f) with open(special_tokens_map_path, "r", encoding="utf-8") as f: @@ -32,15 +34,32 @@ def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_confi with tempfile.NamedTemporaryFile(mode="w", delete=False) as vocab_file: json.dump(vocab, vocab_file) - with tempfile.NamedTemporaryFile(mode="w", delete=False) as merges_file: - merges_file.write("\n".join(merges)) - tokenizer = Tokenizer(models.BPE(vocab_file.name, merges_file.name)) + if tokenizer_type == "BPE": + with tempfile.NamedTemporaryFile(mode="w", delete=False) as merges_file: + merges_file.write("\n".join(merges)) + + tokenizer = Tokenizer(models.BPE.from_file(vocab_file.name, merges_file.name)) + os.unlink(merges_file.name) + + tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) + tokenizer.decoder = decoders.ByteLevel() + + elif tokenizer_type == "WordPiece": + tokenizer = Tokenizer(models.WordPiece.from_file(vocab_file.name)) + tokenizer.pre_tokenizer = pre_tokenizers.Whitespace() + tokenizer.decoder = decoders.WordPiece() + + elif tokenizer_type == "SentencePiece": + tokenizer = Tokenizer(models.SentencePiece.from_file(vocab_file.name)) + tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit() + tokenizer.decoder = decoders.SentencePiece() + + + else: + raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}") + os.unlink(vocab_file.name) - os.unlink(merges_file.name) - - tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) - tokenizer.decoder = decoders.ByteLevel() bos_token_id = tokenizer.token_to_id(special_tokens_map["bos_token"]) eos_token_id = tokenizer.token_to_id(special_tokens_map["eos_token"]) @@ -56,13 +75,15 @@ def load_tokenizer_from_json(json_path, special_tokens_map_path, 
tokenizer_confi return tokenizer + if __name__ == "__main__": input_json_path = os.path.join(token_dir, "tokenizer.json") special_tokens_map_path = os.path.join(token_dir, "special_tokens_map.json") tokenizer_config_path = os.path.join(token_dir, "tokenizer_config.json") output_model_path = os.path.join(dir_out, "tokenizer.model") - tokenizer = load_tokenizer_from_json(input_json_path, special_tokens_map_path, tokenizer_config_path) + tokenizer = load_tokenizer_from_json(input_json_path, special_tokens_map_path, tokenizer_config_path, tokenizer_type) print(f"Saving.. tokenizer.model to {output_model_path}") tokenizer.save(output_model_path) print(f"Saved tokenizer.model to {output_model_path}") + From 902075752a407da013d429a96cd66eac8b75f0bb Mon Sep 17 00:00:00 2001 From: aeslampanah Date: Thu, 13 Apr 2023 07:58:45 -0400 Subject: [PATCH 4/4] Add sentencepiece processor --- tokenconvert.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tokenconvert.py b/tokenconvert.py index 9d2dd6b68..f5b6d1d53 100755 --- a/tokenconvert.py +++ b/tokenconvert.py @@ -2,6 +2,7 @@ import json import os import tempfile import sys +import sentencepiece as spm from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors if len(sys.argv) < 3: @@ -51,10 +52,11 @@ def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_confi tokenizer.decoder = decoders.WordPiece() elif tokenizer_type == "SentencePiece": - tokenizer = Tokenizer(models.SentencePiece.from_file(vocab_file.name)) - tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit() - tokenizer.decoder = decoders.SentencePiece() - + sp_model = spm.SentencePieceProcessor() + sp_model.Load(vocab_file.name) + tokenizer = Tokenizer(models.Model.from_sentencepiece(sp_model)) + tokenizer.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Metaspace(), pre_tokenizers.Split()]) + tokenizer.decoder = decoders.Sequence([decoders.Split(), decoders.Metaspace()]) else: raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
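
Note on the converter output (a minimal sketch, not taken from the patches above): convert-hf-to-ggml-v2.py writes a fixed binary header — the magic value 0x67676d6c followed by vocab_size, hidden_size, multiple_of, n_head, n_layer and ftype as int32 values, then the vocabulary as length-prefixed UTF-8 strings — before the tensor data. The snippet below reads that header back as a quick sanity check on a converted file; the helper name read_ggml_header and the example paths are hypothetical, and the layout is assumed to match the struct.pack calls in PATCH 2/4.

    import struct
    import sys

    def read_ggml_header(path, sample=5):
        # Read the seven int32 header fields written by convert-hf-to-ggml-v2.py,
        # then the first few length-prefixed vocab entries.
        with open(path, "rb") as f:
            magic, vocab_size, hidden_size, multiple_of, n_head, n_layer, ftype = struct.unpack("7i", f.read(28))
            if magic != 0x67676d6c:
                raise ValueError("bad magic: not a ggml file")
            tokens = []
            for _ in range(min(vocab_size, sample)):
                (length,) = struct.unpack("i", f.read(4))
                tokens.append(f.read(length).decode("utf-8", errors="replace"))
        return {"vocab_size": vocab_size, "hidden_size": hidden_size,
                "multiple_of": multiple_of, "n_head": n_head,
                "n_layer": n_layer, "ftype": ftype, "first_tokens": tokens}

    if __name__ == "__main__":
        # Example (hypothetical output path from running the converter on bigscience/bloomz-560m):
        #   python check_ggml_header.py ./models/ggml-model-bloomz-560m-f16.bin
        print(read_ggml_header(sys.argv[1]))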