From 7c8ee5aec561eb210aea25944e3f81a2a3a37ea4 Mon Sep 17 00:00:00 2001
From: aeslampanah
Date: Thu, 13 Apr 2023 07:05:29 -0400
Subject: [PATCH] Updated tokenconvert.py script to add support for
 SentencePiece and WordPiece tokenizers, updated arguments

---
 tokenconvert.py | 49 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 tokenconvert.py

diff --git a/tokenconvert.py b/tokenconvert.py
old mode 100644
new mode 100755
index 06b83affb..9d2dd6b68
--- a/tokenconvert.py
+++ b/tokenconvert.py
@@ -4,21 +4,23 @@
 import tempfile
 import sys
 from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
 
-if len(sys.argv) < 1:
-    print("Usage: python tokenconvert.py token-dir [dir-output]")
+if len(sys.argv) < 3:
+    print("Usage: python tokenconvert.py tokenizer_type token-dir [dir-output]")
+    print(" tokenizer_type: The type of tokenizer (check the model information), eg: BPE, WordPiece, SentencePiece.")
     print(" token-dir: Directory of the model containing the tokenizer.json. Example: 'bigscience/bloomz-560m'")
     print(" dir-output: directory where the output file will be written, eg: ./tokenizer.model , by default writes to the same directory.")
     sys.exit(1)
 
-token_dir = sys.argv[1]
+tokenizer_type = sys.argv[1]
+token_dir = sys.argv[2]
 
-if len(sys.argv) < 3:
+if len(sys.argv) < 4:
     dir_out = token_dir
 else:
-    dir_out = sys.argv[2]
+    dir_out = sys.argv[3]
 
-def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_config_path):
+def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_config_path, tokenizer_type):
     with open(json_path, "r", encoding="utf-8") as f:
         json_data = json.load(f)
     with open(special_tokens_map_path, "r", encoding="utf-8") as f:
@@ -32,15 +34,32 @@ def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_confi
     with tempfile.NamedTemporaryFile(mode="w", delete=False) as vocab_file:
         json.dump(vocab, vocab_file)
 
-    with tempfile.NamedTemporaryFile(mode="w", delete=False) as merges_file:
-        merges_file.write("\n".join(merges))
-    tokenizer = Tokenizer(models.BPE(vocab_file.name, merges_file.name))
+    if tokenizer_type == "BPE":
+        with tempfile.NamedTemporaryFile(mode="w", delete=False) as merges_file:
+            merges_file.write("\n".join(merges))
+
+        tokenizer = Tokenizer(models.BPE.from_file(vocab_file.name, merges_file.name))
+        os.unlink(merges_file.name)
+
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+        tokenizer.decoder = decoders.ByteLevel()
+
+    elif tokenizer_type == "WordPiece":
+        tokenizer = Tokenizer(models.WordPiece.from_file(vocab_file.name))
+        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+        tokenizer.decoder = decoders.WordPiece()
+
+    elif tokenizer_type == "SentencePiece":
+        tokenizer = Tokenizer(models.SentencePiece.from_file(vocab_file.name))
+        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
+        tokenizer.decoder = decoders.SentencePiece()
+
+
+    else:
+        raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
+
 
     os.unlink(vocab_file.name)
-    os.unlink(merges_file.name)
-
-    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
-    tokenizer.decoder = decoders.ByteLevel()
 
     bos_token_id = tokenizer.token_to_id(special_tokens_map["bos_token"])
     eos_token_id = tokenizer.token_to_id(special_tokens_map["eos_token"])
@@ -56,13 +75,15 @@ def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_confi
 
     return tokenizer
 
+
 if __name__ == "__main__":
     input_json_path = os.path.join(token_dir, "tokenizer.json")
     special_tokens_map_path = os.path.join(token_dir, "special_tokens_map.json")
     tokenizer_config_path = os.path.join(token_dir, "tokenizer_config.json")
     output_model_path = os.path.join(dir_out, "tokenizer.model")
-    tokenizer = load_tokenizer_from_json(input_json_path, special_tokens_map_path, tokenizer_config_path)
+    tokenizer = load_tokenizer_from_json(input_json_path, special_tokens_map_path, tokenizer_config_path, tokenizer_type)
 
     print(f"Saving.. tokenizer.model to {output_model_path}")
     tokenizer.save(output_model_path)
     print(f"Saved tokenizer.model to {output_model_path}")
+
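
Usage note (not part of the patch): with the reworked arguments the script would be invoked roughly as "python tokenconvert.py BPE bigscience/bloomz-560m ./out", following the usage text above; the model directory and the ./out output path are illustrative placeholders. The sketch below is one possible sanity check of the written file. It assumes that tokenizer.save() emits the standard tokenizers JSON serialization, which Tokenizer.from_file can read back; the paths and the sample sentence are made up.

    # Hedged sketch: reload the tokenizer.model written by tokenconvert.py and
    # round-trip a sample sentence to confirm the vocab and decoder were wired up.
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("./out/tokenizer.model")  # placeholder path from the example invocation
    enc = tok.encode("Hello world")
    print(enc.tokens)            # token strings
    print(enc.ids)               # vocabulary ids
    print(tok.decode(enc.ids))   # should round-trip close to the original input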