Added a token conversion script that converts tokenizers from the tokenizer.json format to the tokenizer.model format; tested with bigscience models
This commit is contained in:
parent
aaf3b23deb
commit
a78c42d5da
1 changed file with 68 additions and 0 deletions
tokenconvert.py  68  Normal file
@@ -0,0 +1,68 @@
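# Convert a Hugging Face tokenizer.json (byte-level BPE) and its companion
# JSON files into a single tokenizer.model file.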
import json
import os
import tempfile
import sys
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

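# token-dir is required; dir-output is optional and defaults to token-dir.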
if len(sys.argv) < 2:
    print("Usage: python tokenconvert.py token-dir [dir-output]")
    print("  token-dir: Directory of the model containing tokenizer.json. Example: 'bigscience/bloomz-560m'")
    print("  dir-output: Directory where the output tokenizer.model will be written; defaults to token-dir.")
    sys.exit(1)

token_dir = sys.argv[1]

if len(sys.argv) < 3:
    dir_out = token_dir
else:
    dir_out = sys.argv[2]


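# Rebuild a `tokenizers` BPE tokenizer from the three JSON files that ship
# with a Hugging Face model: tokenizer.json, special_tokens_map.json and
# tokenizer_config.json.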
def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_config_path):
    with open(json_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    with open(special_tokens_map_path, "r", encoding="utf-8") as f:
        special_tokens_map = json.load(f)
    # tokenizer_config is loaded for completeness but not otherwise used below.
    with open(tokenizer_config_path, "r", encoding="utf-8") as f:
        tokenizer_config = json.load(f)

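    # tokenizer.json nests the BPE data under "model": "vocab" maps token
    # strings to ids, and "merges" lists the ranked merge rules (here as
    # "left right" strings; newer tokenizer.json files may store them as pairs).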
    model_data = json_data["model"]
    vocab = model_data["vocab"]
    merges = model_data["merges"]

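    # BPE.from_file expects a JSON vocab file and a plain-text merges file
    # (one merge rule per line), so write the in-memory data to temporary
    # files first; UTF-8 is forced so non-ASCII byte-level tokens survive
    # on platforms whose default encoding is not UTF-8.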
    with tempfile.NamedTemporaryFile(mode="w", delete=False, encoding="utf-8") as vocab_file:
        json.dump(vocab, vocab_file)
    with tempfile.NamedTemporaryFile(mode="w", delete=False, encoding="utf-8") as merges_file:
        merges_file.write("\n".join(merges))

    # Newer tokenizers releases load files via BPE.from_file; the bare
    # BPE(...) constructor takes in-memory vocab/merges rather than paths.
    tokenizer = Tokenizer(models.BPE.from_file(vocab_file.name, merges_file.name))
    os.unlink(vocab_file.name)
    os.unlink(merges_file.name)

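    # Byte-level pre-tokenization and decoding, matching GPT-2-style BPE
    # models such as BLOOM.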
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

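    # Resolve the ids of the special tokens declared in special_tokens_map.json.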
    bos_token_id = tokenizer.token_to_id(special_tokens_map["bos_token"])
    eos_token_id = tokenizer.token_to_id(special_tokens_map["eos_token"])

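    # Re-attach BOS/EOS wrapping for single sequences and sequence pairs.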
    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{special_tokens_map['bos_token']} $A {special_tokens_map['eos_token']}",
        pair=f"{special_tokens_map['bos_token']} $A {special_tokens_map['eos_token']} {special_tokens_map['bos_token']} $B {special_tokens_map['eos_token']}",
        special_tokens=[
            (special_tokens_map["bos_token"], bos_token_id),
            (special_tokens_map["eos_token"], eos_token_id),
        ],
    )

    return tokenizer


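# Entry point: locate the three JSON inputs inside token_dir and write the
# converted tokenizer to dir_out.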
if __name__ == "__main__":
    input_json_path = os.path.join(token_dir, "tokenizer.json")
    special_tokens_map_path = os.path.join(token_dir, "special_tokens_map.json")
    tokenizer_config_path = os.path.join(token_dir, "tokenizer_config.json")
    output_model_path = os.path.join(dir_out, "tokenizer.model")

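    # Note: Tokenizer.save writes the `tokenizers` JSON serialization, so the
    # resulting tokenizer.model is JSON rather than a SentencePiece protobuf.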
    tokenizer = load_tokenizer_from_json(input_json_path, special_tokens_map_path, tokenizer_config_path)
    print(f"Saving tokenizer.model to {output_model_path}")
    tokenizer.save(output_model_path)
    print(f"Saved tokenizer.model to {output_model_path}")
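
Example invocation (a sketch, assuming the tokenizer.json, special_tokens_map.json and tokenizer_config.json for bigscience/bloomz-560m have been downloaded into a local directory of the same name):

    python tokenconvert.py bigscience/bloomz-560m .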