diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-bpe.py
similarity index 100%
rename from tests/test-tokenizer-0-falcon.py
rename to tests/test-tokenizer-0-bpe.py
diff --git a/tests/test-tokenizer-0-deepseek-coder.py b/tests/test-tokenizer-0-deepseek-coder.py
deleted file mode 100644
index b99840e1b..000000000
--- a/tests/test-tokenizer-0-deepseek-coder.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# tests with BPE tokenizer
-
-import argparse
-
-from transformers import AutoTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is πŸ¦™.cpp",
-    "w048 7tuijk dsdfhu",
-    "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
-    "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
-    "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~",
-]
-
-for text in tests:
-    print('text: ', text)
-    print(tokenizer.encode(text))
-    print(tokenizer.decode(tokenizer.encode(text)))
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0-deepseek-llm.py b/tests/test-tokenizer-0-deepseek-llm.py
deleted file mode 100644
index b99840e1b..000000000
--- a/tests/test-tokenizer-0-deepseek-llm.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# tests with BPE tokenizer
-
-import argparse
-
-from transformers import AutoTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is πŸ¦™.cpp",
-    "w048 7tuijk dsdfhu",
-    "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
-    "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
-    "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~",
-]
-
-for text in tests:
-    print('text: ', text)
-    print(tokenizer.encode(text))
-    print(tokenizer.decode(tokenizer.encode(text)))
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-spm.py
similarity index 100%
rename from tests/test-tokenizer-0-llama.py
rename to tests/test-tokenizer-0-spm.py