diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf
index 72fdb409c..f8022a385 100644
Binary files a/models/ggml-vocab-phi-3.gguf and b/models/ggml-vocab-phi-3.gguf differ
diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py
deleted file mode 100644
index 33a272441..000000000
--- a/tests/test-tokenizer-0-bpe.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# tests with BPE tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
-#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
-#
-
-import argparse
-
-from transformers import AutoTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-for text in tests:
-    print('text: ', text)
-    print(tokenizer.encode(text))
-    print(tokenizer.decode(tokenizer.encode(text)))
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                # LLaMA v3 for some reason strips the space for these tokens (and others)
-                # if x == 662:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 1174:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 2564:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 758:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 949:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # elif x == 5354:
-                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-                # else:
-                #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-                f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
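For reference, the deleted script's "tests for C++" loop emitted one initializer row per test string for the static table consumed by the C++ tokenizer test. A row it printed looked roughly like this (the token IDs here are illustrative, since they depend on the vocab being tested):

    { "Hello world"           , {    9906,   1917, }, },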
diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py
deleted file mode 100644
index be12a6b93..000000000
--- a/tests/test-tokenizer-0-spm.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# tests with SPM tokenizer
-#
-# sample usage:
-#
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
-#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
-#
-
-
-import argparse
-
-from sentencepiece import SentencePieceProcessor
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok", help="path to a text file to tokenize")
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-
-tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
-
-tests = [
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-]
-
-
-for text in tests:
-    print('text: ', text)
-    print('\nwith bos:')
-    print(tokenizer.encode(text, add_bos=True))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
-    print('\nwithout bos:')
-    print(tokenizer.encode(text, add_bos=False))
-    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
-
-print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
-print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
-print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
-print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
-print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
-print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
-
-print("\n\ntests for C++:\n")
-for text in tests:
-    res = tokenizer.encode(text, add_bos=False)
-
-    k = text.replace('\n', '\\n')
-    k = k.replace('\t', '\\t')
-    k = '"' + k + '"'
-    print("{ %-24s, { " % k, end='')
-    for x in res:
-        print("%7d," % x, end='')
-    print(" }, },")
-
-print(tokenizer.encode('hello'))
-print(tokenizer.encode('world'))
-print(tokenizer.encode(' world'))
-print(tokenizer.encode('hello world'))
-
-fname_tok = args.fname_tok
-if fname_tok:
-    print('tokenizing file: ', fname_tok)
-    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-        s = ''.join(lines)
-        res = tokenizer.encode(s, add_bos=True)
-        # write to file
-        with open(fname_out, 'w', encoding='utf-8') as f:
-            for x in res:
-                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-        print('len(res): ', len(res))
-        print('len(lines): ', len(lines))
-    print('results written to: ', fname_out)
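Both deleted scripts are superseded by the randomized test below, which tokenizes the same text with libllama (through the script's cffi wrapper, LibLlamaModel) and with the Hugging Face tokenizer, then requires identical IDs. A minimal sketch of that comparison pattern, including a plausible find_first_mismatch helper (the hunks reference this helper but its definition lies outside the diff context, so this version is an assumption):

    def find_first_mismatch(ids1: list[int], ids2: list[int]) -> int:
        # first index where the two token-ID sequences disagree; if one is a
        # strict prefix of the other, report the shorter length; -1 if equal
        for i, (a, b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i
        return -1 if len(ids1) == len(ids2) else min(len(ids1), len(ids2))

    def check(model, tokenizer, text: str):
        ids1 = model.tokenize(text, parse_special=True)  # llama.cpp tokenizer
        ids2 = tokenizer.encode(text)                    # Hugging Face reference
        if ids1 != ids2:
            raise Exception("first mismatch at index %d" % find_first_mismatch(ids1, ids2))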
diff --git a/tests/test-tokenizer-random-bpe.py b/tests/test-tokenizer-random-bpe.py
index c2ad210aa..379154033 100644
--- a/tests/test-tokenizer-random-bpe.py
+++ b/tests/test-tokenizer-random-bpe.py
@@ -5,13 +5,16 @@
 #   python3 tests/test-tokenizer-0-bpe.py ./models/ggml-vocab-llama-bpe.gguf ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
 #
 
-import random
+import logging
 import argparse
 import subprocess
+import random
 import cffi
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
+logger = logging.getLogger("test-tokenizer-random-bpe")
+
 
 class LibLlama:
@@ -152,11 +155,11 @@ def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
     for text in tests+more_tests:
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        print(repr(text))
+        logger.info(repr(text))
         if ids1 != ids2:
-            print(" TokenIDs:", list(ids1))
-            print(" Expected:", list(ids2))
-            print(" Index:", find_first_mismatch(ids1, ids2) )
+            logger.info(" TokenIDs: " + str(list(ids1)))
+            logger.info(" Expected: " + str(list(ids2)))
+            logger.info(" Index: %d" % find_first_mismatch(ids1, ids2))
             raise Exception()
 
 
@@ -171,11 +174,11 @@ def test_random_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, it
         .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
         """))
 
-    print( "Bruteforce random chars encodings ..." )
+    logger.info("Bruteforce random chars encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
        rand.seed(m)
 
         text = []
@@ -194,17 +197,17 @@
 def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
-    print( "Building vocab char list ..." )
+    logger.info("Building vocab char list ...")
     vocab_ids = list(tokenizer.vocab.values())
     vocab_text = tokenizer.decode(vocab_ids)
     vocab_chars = list(set(vocab_text))
     del vocab_ids, vocab_text
 
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
 
         text = rand.choices(vocab_chars, k=1024)
@@ -212,12 +215,12 @@ def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBa
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 
 
 def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
-    print( "Building token list ..." )
+    logger.info("Building token list ...")
     space_id = tokenizer.encode(" ")[0]
     vocab_ids = list(tokenizer.vocab.values())
     vocab_ids = list(sorted(vocab_ids + vocab_ids))
@@ -227,17 +230,17 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
     vocab_tokens = vocab_tokens.split(" ")
     del vocab_ids
 
-    print( "Checking single token encodings ..." )
+    logger.info("Checking single token encodings ...")
     for token in vocab_tokens:
         ids1 = model.tokenize(token, parse_special=True)
         ids2 = tokenizer.encode(token)
         assert(ids1 == ids2)
 
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
 
         text = []
@@ -252,18 +255,18 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 
 
 def test_random_bytes(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
 
     WHITESPACES = list(" "*20 + "\n"*5 + "\r\n"*5 + "\t"*5)
 
-    print( "Bruteforce random bytes encodings ..." )
+    logger.info("Bruteforce random bytes encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
 
         text = []
@@ -285,8 +288,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
     parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     args = parser.parse_args()
 
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
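After this change the test prints only the INFO-level phase headers by default; passing --verbose sets logging.basicConfig to DEBUG and re-enables the per-iteration "m+1/iterations" progress lines that used to be plain prints. Sample invocation (paths illustrative, following the script's own usage comment):

    python3 tests/test-tokenizer-random-bpe.py ./models/ggml-vocab-llama-bpe.gguf ~/Data/huggingface/Meta-Llama-3-8B-Instruct/ --verbose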