Fix merge

jaime-m-p 2024-05-05 00:42:44 +02:00
parent 798b576c06
commit 69a49ac3a1
4 changed files with 24 additions and 249 deletions

Binary file not shown.

tests/test-tokenizer-0-bpe.py

@@ -1,117 +0,0 @@
# tests with BPE tokenizer
#
# sample usage:
#
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
#
import argparse
from transformers import AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
]
for text in tests:
    print('text: ', text)
    print(tokenizer.encode(text))
    print(tokenizer.decode(tokenizer.encode(text)))
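# Note: for byte-level BPE vocabs, decode(encode(text)) should round-trip the
# input exactly, so the decoded string is expected to match the original text.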
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text)
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
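# The four encodings above probe leading-space handling: with most byte-level
# BPE vocabs, 'world' and ' world' map to different token IDs.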
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s)
    # write to file
    with open(fname_out, 'w', encoding='utf-8') as f:
        for x in res:
            # LLaMA v3 for some reason strips the space for these tokens (and others)
            # if x == 662:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 1174:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 2564:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 758:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 949:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 5354:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # else:
            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)

tests/test-tokenizer-0-spm.py

@@ -1,114 +0,0 @@
# tests with SPM tokenizer
#
# sample usage:
#
# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
#
import argparse
from sentencepiece import SentencePieceProcessor
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
]
for text in tests:
    print('text: ', text)
    print('\nwith bos:')
    print(tokenizer.encode(text, add_bos=True))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
    print('\nwithout bos:')
    print(tokenizer.encode(text, add_bos=False))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text, add_bos=False)
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s, add_bos=True)
    # write to file
    with open(fname_out, 'w', encoding='utf-8') as f:
        for x in res:
            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)

tests/test-tokenizer-random-bpe.py

@@ -5,13 +5,16 @@
 # python3 tests/test-tokenizer-0-bpe.py ./models/ggml-vocab-llama-bpe.gguf ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
 #
-import random
+import logging
 import argparse
 import subprocess
 import random
 import cffi
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+logger = logging.getLogger("test-tokenizer-random-bpe")
+
 class LibLlama:
@@ -152,11 +155,11 @@ def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
     for text in tests+more_tests:
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        print(repr(text))
+        logger.info(repr(text))
         if ids1 != ids2:
-            print(" TokenIDs:", list(ids1))
-            print(" Expected:", list(ids2))
-            print(" Index:", find_first_mismatch(ids1, ids2) )
+            logger.info(" TokenIDs: " + str(list(ids1)))
+            logger.info(" Expected: " + str(list(ids2)))
+            logger.info(" Index: %d" % find_first_mismatch(ids1, ids2))
             raise Exception()
@@ -171,11 +174,11 @@ def test_random_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, it
         .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
     """))
-    print( "Bruteforce random chars encodings ..." )
+    logger.info("Bruteforce random chars encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = []
@@ -194,17 +197,17 @@ def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBa
 def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
-    print( "Building vocab char list ..." )
+    logger.info("Building vocab char list ...")
     vocab_ids = list(tokenizer.vocab.values())
     vocab_text = tokenizer.decode(vocab_ids)
     vocab_chars = list(set(vocab_text))
     del vocab_ids, vocab_text
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = rand.choices(vocab_chars, k=1024)
@@ -212,12 +215,12 @@ def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBa
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
-    print( "Building token list ..." )
+    logger.info("Building token list ...")
     space_id = tokenizer.encode(" ")[0]
     vocab_ids = list(tokenizer.vocab.values())
     vocab_ids = list(sorted(vocab_ids + vocab_ids))
@@ -227,17 +230,17 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
     vocab_tokens = vocab_tokens.split(" ")
     del vocab_ids
-    print( "Checking single token encodings ..." )
+    logger.info("Checking single token encodings ...")
     for token in vocab_tokens:
         ids1 = model.tokenize(token, parse_special=True)
         ids2 = tokenizer.encode(token)
         assert(ids1 == ids2)
-    print( "Bruteforce random text encodings ..." )
+    logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = []
@@ -252,18 +255,18 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
-        assert( ids1 == ids2 )
+        assert(ids1 == ids2)
 def test_random_bytes(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
     WHITESPACES = list(" "*20 + "\n"*5 + "\r\n"*5 + "\t"*5)
-    print( "Bruteforce random bytes encodings ..." )
+    logger.info("Bruteforce random bytes encodings ...")
     rand = random.Random()
     for m in range(iterations):
-        print(m)
+        logger.debug("%d/%d" % (m+1,iterations))
         rand.seed(m)
         text = []
@@ -285,8 +288,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
     parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
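
A sample invocation of the updated test with the new flag (script name inferred from the logger name above; model paths echo the script's own usage comment):

    python3 tests/test-tokenizer-random-bpe.py ./models/ggml-vocab-llama-bpe.gguf ~/Data/huggingface/Meta-Llama-3-8B-Instruct/ --verbose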