diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py
deleted file mode 100644
index 33a272441..000000000
--- a/tests/test-tokenizer-0-bpe.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# tests with BPE tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
-#
-
-import argparse
-
-from transformers import AutoTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is πŸ¦™.cpp",
-    "w048 7tuijk dsdfhu",
-    "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
-    "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
-    "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-for text in tests:
-    print('text: ', text)
-    print(tokenizer.encode(text))
-    print(tokenizer.decode(tokenizer.encode(text)))
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                # LLaMA v3 for some reason strips the space for these tokens (and others)
-                # if x == 662:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 1174:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 2564:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 758:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 949:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 5354:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # else:
-                #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-                f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py
deleted file mode 100644
index be12a6b93..000000000
--- a/tests/test-tokenizer-0-spm.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# tests with SPM tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
-#
-
-
-import argparse
-
-from sentencepiece import SentencePieceProcessor
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is πŸ¦™.cpp",
-    "w048 7tuijk dsdfhu",
-    "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",
-    "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰",
-    "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-
-for text in tests:
-    print('text: ', text)
-    print('\nwith bos:')
-    print(tokenizer.encode(text, add_bos=True))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
-    print('\nwithout bos:')
-    print(tokenizer.encode(text, add_bos=False))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
-
-print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
-print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
-print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
-print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
-print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
-print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text, add_bos=False)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s, add_bos=True)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 5122757cf..d478f1041 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -55,8 +55,10 @@
 //     return _k_tests;
 //}
 
-static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
-    std::map<std::string, std::vector<llama_token>> tests;
+using llama_tests = std::map<std::string, std::vector<llama_token>>;
+
+static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    llama_tests tests;
 
     std::ifstream ifs_inp(fname_inp);
     if (!ifs_inp) {
@@ -175,12 +177,20 @@ int main(int argc, char **argv) {
 
     bool success = true;
 
-    const auto k_tests = read_tests(fname_inp, fname_out);
+    const auto k_tests = [&]() -> llama_tests {
+        if (!fname_text.empty()) {
+            return {};
+        }
 
-    if (k_tests.empty()) {
-        fprintf(stderr, "%s : error: no tests found\n", __func__);
-        return 1;
-    }
+        const auto res = read_tests(fname_inp, fname_out);
+
+        if (res.empty()) {
+            fprintf(stderr, "%s : error: no tests found\n", __func__);
+            exit(1);
+        }
+
+        return res;
+    }();
 
     const bool add_special = false;
 
@@ -238,7 +248,17 @@ int main(int argc, char **argv) {
 
         fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
 
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
+        std::vector<llama_token> res;
+
+        {
+            const auto t_start = ggml_time_us();
+
+            res = llama_tokenize(ctx, text, add_special);
+
+            const auto t_end = ggml_time_us();
+
+            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
+        }
 
         fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
 
@@ -252,7 +272,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl;
+            //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl;
+            ofs << tok << "\n";
         }
     }
 
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
new file mode 100644
index 000000000..8e7638e42
--- /dev/null
+++ b/tests/test-tokenizer-0.py
@@ -0,0 +1,46 @@
+import time
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True)
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+fname_tok = args.fname_tok
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+print('tokenizing file: ', fname_tok)
+fname_out = fname_tok + '.tok'
+with open(fname_tok, 'r', encoding='utf-8') as f:
+    lines = f.readlines()
+    s = ''.join(lines)
+    t_start = time.time()
+    res = tokenizer.encode(s, add_special_tokens=False)
+    t_end = time.time()
+    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')
+    with open(fname_out, 'w', encoding='utf-8') as f:
+        for x in res:
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
+            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
+            f.write(str(x) + '\n')
+        print('len(res): ', len(res))
+        print('len(lines): ', len(lines))
+print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh
new file mode 100755
index 000000000..2fb8632d8
--- /dev/null
+++ b/tests/test-tokenizer-0.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Usage:
+#
+#   test-tokenizer-0.sh <name> <input>
+#
+
+if [ $# -ne 2 ]; then
+    printf "Usage: $0 <name> <input>\n"
+    exit 1
+fi
+
+name=$1
+input=$2
+
+make -j tests/test-tokenizer-0
+
+printf "Testing %s on %s ...\n" $name $input
+
+python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
+cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
+
+./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
+cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
+
+diff $input.tok $input.tokcpp > /dev/null 2>&1
+
+if [ $? -eq 0 ]; then
+    printf "Tokenization is correct!\n"
+else
+    diff $input.tok $input.tokcpp | head -n 32
+
+    printf "Tokenization differs!\n"
+fi