From eb8b3264f6054a9f688c3df8ff441aca400ad426 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 27 Aug 2023 00:41:44 +0300
Subject: [PATCH] tests : add test-tokenizer-1.py

---
 tests/test-tokenizer-0.py |  7 ++++
 tests/test-tokenizer-1.py | 83 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 tests/test-tokenizer-1.py

diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
index a21e9ed70..bc164ee29 100644
--- a/tests/test-tokenizer-0.py
+++ b/tests/test-tokenizer-0.py
@@ -1,3 +1,5 @@
+# tests with SPM tokenizer
+
 import os
 import sys
 import argparse
@@ -70,6 +72,11 @@ for text in tests:
         print("%7d," % x, end='')
     print(" }, },")
 
+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
 fname_tok = args.fname_tok
 if fname_tok:
     print('tokenizing file: ', fname_tok)
diff --git a/tests/test-tokenizer-1.py b/tests/test-tokenizer-1.py
new file mode 100644
index 000000000..9c8c1c7d1
--- /dev/null
+++ b/tests/test-tokenizer-1.py
@@ -0,0 +1,83 @@
+# tests with BPE tokenizer
+
+import os
+import sys
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+tests = [
+        "",
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is πŸ¦™.cpp",
+        "w048 7tuijk dsdfhu",
+        "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
+        "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
+        "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+    ]
+
+for text in tests:
+    print('text: ', text)
+    print(tokenizer.encode(text))
+    print(tokenizer.decode(tokenizer.encode(text)))
+
+print("\n\ntests for C++:\n")
+for text in tests:
+    res = tokenizer.encode(text)
+
+    k = text.replace('\n', '\\n')
+    k = k.replace('\t', '\\t')
+    k = '"' + k + '"'
+    print("{ %-24s, { " % k, end='')
+    for x in res:
+        print("%7d," % x, end='')
+    print(" }, },")
+
+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+        s = ''.join(lines)
+        res = tokenizer.encode(s)
+        # write to file
+        with open(fname_out, 'w') as f:
+            for x in res:
+                f.write(str(x) + ' ')
+            f.write('\n')
+        print('len(res): ', len(res))
+        print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
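
Note: both scripts write the token ids of --fname-tok to a '.tok' file as whitespace-separated integers, which makes it easy to diff the HF tokenization against the one produced by llama.cpp's C++ tokenizer. A minimal sketch of such a comparison helper is below; the helper and its file paths are hypothetical and not part of this patch, it only assumes the output format produced above.

# compare-tok.py (hypothetical helper, not part of this patch)
import sys

def read_tok(path):
    # the test scripts write token ids separated by spaces
    with open(path, 'r') as f:
        return [int(t) for t in f.read().split()]

a = read_tok(sys.argv[1])
b = read_tok(sys.argv[2])

if a == b:
    print('tokenizations match (%d tokens)' % len(a))
else:
    n = min(len(a), len(b))
    for i in range(n):
        if a[i] != b[i]:
            print('first mismatch at token %d: %d vs %d' % (i, a[i], b[i]))
            break
    else:
        print('length mismatch: %d vs %d tokens' % (len(a), len(b)))

Example usage (file names hypothetical): python3 compare-tok.py input.txt.tok input.txt.tok.cpp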