Update bruteforce random tests

Add detokenizer checks
New generator: ascii_lr_strip
New generator: apostrophe
Add more vocab files
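
The two new generators brute-force short ASCII patterns around leading/trailing whitespace and apostrophes (contractions are a common pre-tokenizer edge case). A standalone preview of the apostrophe generator is sketched below; it mirrors the version added in the diff (the whitespace list is reconstructed from the diff, and the driver loop is only for illustration):

# Standalone sketch: preview the strings generator_apostrophe() feeds to both tokenizers.
from typing import Iterator


def generator_apostrophe() -> Iterator[str]:
    WHITESPACES = ["", " ", "  "]
    CHARACTERS = [chr(i) for i in range(1, 0x80)] + [""]
    for char1 in CHARACTERS:
        for char2 in CHARACTERS:
            for lstrip in WHITESPACES:
                for rstrip in WHITESPACES:
                    # an apostrophe framed by optional whitespace and ASCII chars,
                    # producing contraction-like splits such as "d' t"
                    yield char1 + lstrip + "'" + rstrip + char2
                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
                    yield "a" + lstrip + "'" + rstrip + char1 + char2


if __name__ == "__main__":
    for text, _ in zip(generator_apostrophe(), range(9)):
        print(repr(text))  # each string is encoded and decoded by both tokenizers and compared
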
jaime-m-p 2024-06-20 21:41:37 +02:00
parent 071bf42f23
commit 064b35eaff
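
For context on the reworked wrappers: the retry loops in the diff assume that llama_tokenize() / llama_detokenize() report a too-small output buffer by returning a negative count, so the wrapper reallocates to -2 * num (capped at 16 Mi entries, i.e. 16 << 20) and calls again. A minimal standalone sketch of that grow-and-retry pattern, using a stub in place of the real C call (fake_llama_tokenize is purely illustrative, not the libllama API):

def fake_llama_tokenize(text: bytes, buf: list[int]) -> int:
    # pretend: one token per whitespace-separated word
    needed = len(text.split())
    if needed > len(buf):
        return -needed            # buffer too small: report required size as a negative count
    buf[:needed] = range(needed)  # write dummy token ids
    return needed


def tokenize_with_retry(text: str) -> list[int]:
    data = text.encode("utf-8")
    buf: list[int] = [0] * 8      # deliberately small initial buffer
    num = fake_llama_tokenize(data, buf)
    while num < 0 and len(buf) < (16 << 20):
        buf = [0] * (-2 * num)    # grow to twice the reported requirement
        num = fake_llama_tokenize(data, buf)
    return buf[:num]


if __name__ == "__main__":
    print(len(tokenize_with_retry("lorem " * 100)))  # -> 100
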


@@ -13,7 +13,7 @@ import subprocess
 import random
 import unicodedata

-from typing import Callable, Iterator
+from typing import Iterator

 import cffi
 from transformers import AutoTokenizer
@@ -79,6 +79,7 @@ class LibLlamaModel:
             raise RuntimeError("error: failed to create context for model '%s'" % path_model)
         n_tokens_max = self.lib.llama_n_ctx(self.ctx)
         self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
+        self.text_buff = self.ffi.new("uint8_t[]", 1024)

     def free(self):
         if self.ctx:
@@ -89,14 +90,78 @@ class LibLlamaModel:
         self.model = None
         self.lib = None

-    def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
-        n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids)
+    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
         text = text.encode("utf-8")
-        num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
-        if num < 0:
-            return []
+        num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special)
+        while num < 0 and len(self.token_ids) < (16 << 20):
+            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
+            num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special)
         return list(self.token_ids[0:num])
 
+    def detokenize(self, ids: list[int], special: bool = False) -> str:
+        if len(self.token_ids) < len(ids):
+            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
+        for i, id in enumerate(ids):
+            self.token_ids[i] = id
+        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), special)
+        while num < 0 and len(self.text_buff) < (16 << 20):
+            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
+            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), special)
+        return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8")
+
+
+class Tokenizer:
+
+    def encode(self, text: str) -> list[int]:
+        raise NotImplementedError
+
+    def decode(self, ids: list[int]) -> str:
+        raise NotImplementedError
+
+
+class TokenizerGroundtruth (Tokenizer):
+
+    def __init__(self, dir_tokenizer: str):
+        self.model = AutoTokenizer.from_pretrained(dir_tokenizer)
+        # guess BOS and EOS
+        ids = self.encode("a")
+        assert 1 <= len(ids) <= 3
+        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
+        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
+        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
+        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
+        # build vocab
+        tokens = list(self.model.get_vocab().values())
+        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
+        self.vocab = list(sorted(self.vocab))
+        # tokens and lists
+        self.special_tokens = list(self.model.all_special_tokens)
+        self.added_tokens = list(self.model.added_tokens_encoder)
+        self.bos_token = self.model.bos_token
+        self.eos_token = self.model.eos_token
+
+    def encode(self, text: str) -> list[int]:
+        return self.model.encode(text, add_special_tokens=True)
+
+    def decode(self, ids: list[int]) -> str:
+        return self.model.decode(ids, skip_special_tokens=True)
+
+
+class TokenizerLlamaCpp (Tokenizer):
+
+    libllama: LibLlama = None
+
+    def __init__(self, vocab_file: str):
+        if not self.libllama:
+            self.libllama = LibLlama()
+        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
+
+    def encode(self, text: str) -> list[int]:
+        return self.model.tokenize(text, add_special=True, parse_special=True)
+
+    def decode(self, ids: list[int]) -> str:
+        return self.model.detokenize(ids, special=False)
+

 def generator_custom_text() -> Iterator[str]:
     """General tests"""
@@ -165,19 +230,43 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         'a </s> b', # rstrip phi-3
         'a <mask> b', # lstrip jina-v2
         '\xa0aC', # deepseek
+        '\u2029 \uA3E4', # deepseek-llm
+        "a ?",
     ]


-def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
+def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
     """Brute force check all vocab words"""
-    yield from vocab
+    yield from tokenizer.vocab


-def generator_added_lr_strip(tokenizer) -> Iterator[str]:
+def generator_ascii_lr_strip() -> Iterator[str]:
+    WHITESPACES = ["", " ", "  "]
+    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
+    for char1 in CHARACTERS:
+        for char2 in CHARACTERS:
+            for lstrip in WHITESPACES:
+                for rstrip in WHITESPACES:
+                    yield lstrip + char1 + char2 + rstrip
+                    yield lstrip + char1 + rstrip + char2
+                    yield char1 + lstrip + char2 + rstrip
+
+
+def generator_apostrophe() -> Iterator[str]:
+    WHITESPACES = ["", " ", "  "]
+    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
+    for char1 in CHARACTERS:
+        for char2 in CHARACTERS:
+            for lstrip in WHITESPACES:
+                for rstrip in WHITESPACES:
+                    yield char1 + lstrip + "'" + rstrip + char2
+                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
+                    yield "a" + lstrip + "'" + rstrip + char1 + char2
+
+
+def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
     WHITESPACES = ["", " ", "  ", "    "]
-    special_tokens = list(tokenizer.all_special_tokens)
-    added_tokens = list(tokenizer.added_tokens_encoder)
-    all_tokens = list(sorted(set(special_tokens + added_tokens)))
+    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
     for token in all_tokens:
         for lstrip in WHITESPACES:
             for rstrip in WHITESPACES:
@@ -187,11 +276,9 @@ def generator_added_lr_strip(tokenizer) -> Iterator[str]:
                 yield "a" + lstrip + token + rstrip + "z"


-def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
-    special_tokens = list(tokenizer.all_special_tokens)
-    added_tokens = list(tokenizer.added_tokens_encoder)
-    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
-    all_tokens = list(sorted(set(special_tokens + added_tokens + separations)))
+def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
+    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
+    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
     rand = random.Random()
     for m in range(iterations):
         rand.seed(m)
@@ -244,11 +331,13 @@ def generator_unicodes() -> Iterator[str]:
             return False
         if 0x00D800 <= cpt <= 0x00F8FF:  # Surrogates
            return False
+        # if cpt == 0x2029:  # deepseek-llm
+        #     return False
         if unicodedata.category(chr(cpt)) == "Cn":
             return False
         return True

-    characters = [chr(cpt) for cpt in range(1, MAX_CODEPOINTS) if _valid(cpt)]
+    characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)]

     yield from characters
@@ -273,11 +362,11 @@ def generator_random_unicodes(iterations=100) -> Iterator[str]:
         yield "".join(text)


-def generator_random_vocab_chars(vocab: list[str], iterations=100) -> Iterator[str]:
+def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
     """Brute force random text with vocab characters"""

     vocab_chars = set()
-    for word in vocab:
+    for word in tokenizer.vocab:
         vocab_chars.update(word)
     vocab_chars = list(sorted(vocab_chars))
@@ -288,10 +377,10 @@ def generator_random_vocab_chars(vocab: list[str], iterations=100) -> Iterator[str]:
         yield "".join(text)


-def generator_random_vocab_words(vocab: list[str], iterations=100) -> Iterator[str]:
+def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
     """Brute force random text from vocab words"""

-    vocab = [w.strip() for w in vocab]
+    vocab = [w.strip() for w in tokenizer.vocab]
     yield from vocab

     rand = random.Random()
@@ -307,7 +396,7 @@ def generator_random_vocab_words(vocab: list[str], iterations=100) -> Iterator[str]:
         yield "".join(text)


-def compare_tokenizers(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]):
+def compare_tokenizers(tokenizer1: Tokenizer, tokenizer2: Tokenizer, generator: Iterator[str]):

     def find_first_mismatch(ids1: list[int], ids2: list[int]):
         for i, (a, b) in enumerate(zip(ids1, ids2)):
@@ -317,34 +406,51 @@ def compare_tokenizers(func_tokenize1: Callable, func_tokenize2: Callable, generator: Iterator[str]):
                 return -1
         return min(len(ids1), len(ids2))

-    t_tokenizer1 = 0
-    t_tokenizer2 = 0
+    t_encode1 = 0
+    t_encode2 = 0
+    t_decode1 = 0
+    t_decode2 = 0
     t_start = time.perf_counter()
-    num_errors = 10
+    num_errors = 0

     logger.info("%s: %s" % (generator.__name__, "ini"))
     for text in generator:
+        # print(repr(text), text.encode())
         # print(repr(text), hex(ord(text[0])), text.encode())
         t0 = time.perf_counter()
-        ids1 = func_tokenize1(text)
+        ids1 = tokenizer1.encode(text)
         t1 = time.perf_counter()
-        ids2 = func_tokenize2(text)
+        ids2 = tokenizer2.encode(text)
         t2 = time.perf_counter()
-        t_tokenizer1 += t1 - t0
-        t_tokenizer2 += t2 - t1
+        text1 = tokenizer1.decode(ids1)
+        t3 = time.perf_counter()
+        text2 = tokenizer2.decode(ids2)
+        t4 = time.perf_counter()
+        t_encode1 += t1 - t0
+        t_encode2 += t2 - t1
+        t_decode1 += t3 - t2
+        t_decode2 += t4 - t3
         if ids1 != ids2:
             i = find_first_mismatch(ids1, ids2)
             ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
             ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
-            logger.error(" TokenIDs: " + str(ids1))
-            logger.error(" Expected: " + str(ids2))
-            # raise Exception()
+            logger.error(" Expected: " + str(ids1))
+            logger.error(" Result: " + str(ids2))
             num_errors += 1
-            if num_errors > 10:
-                break
+        if text1 != text2 and text != text2:
+            i = find_first_mismatch(text1, text2)
+            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
+            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
+            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
+            logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2))
+            num_errors += 1
+        if num_errors >= 10:
+            logger.error(f" EXIT: {num_errors=}")
+            # raise Exception()
+            break

     t_total = time.perf_counter() - t_start
-    logger.info("%s: end, tok1: %.3f tok2: %.3f total: %.3f" % (generator.__name__, t_tokenizer1, t_tokenizer2, t_total))
+    logger.info(f"{generator.__name__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")


 def main(argv: list[str] = None):
@@ -357,74 +463,71 @@ def main(argv: list[str] = None):
     logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
     logger.info(f"VOCABFILE: '{args.vocab_file}'")

-    model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
-    tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
-
-    def func_tokenize1(text: str):
-        return model.tokenize(text, add_special=True, parse_special=True)
-
-    def func_tokenize2(text: str):
-        return tokenizer.encode(text, add_special_tokens=True)
-
-    ids = func_tokenize2("a")
-    assert 1 <= len(ids) <= 3
-    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
-    add_eos_token = len(ids) > 1 and tokenizer.eos_token_id == ids[-1]
-    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)
-    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", add_eos_token)
-
-    vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
-
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text())
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_unicodes())
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_unicodes(10_000))
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
-    compare_tokenizers(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
-
-    model.free()
+    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
+    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
+
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
+
+    tokenizer2.model.free()


 if __name__ == "__main__":
     # main()

+    if True:
+        logging.basicConfig(
+            level = logging.DEBUG,
+            format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
+            datefmt = "%Y-%m-%d %H:%M:%S",
+            filename = logger.name + ".log",
+            filemode = "a"
+        )
+
     logging.basicConfig(
         level = logging.DEBUG,
-        format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
-        datefmt = "%Y-%m-%d %H:%M:%S",
-        filename = logger.name + ".log",
-        filemode = "a"
+        format = "%(levelname)s %(message)s",
     )

     path_tokenizers = "./models/tokenizers/"
     path_vocab_format = "./models/ggml-vocab-%s.gguf"

+    # import os
+    # tokenizers = os.listdir(path_tokenizers)
+
     tokenizers = [
-        # "llama-spm", # SPM
-        # "phi-3", # SPM
-        # "bert-bge", # WPM
-        # "jina-v2-en", # WPM
+        "llama-spm", # SPM
+        "phi-3", # SPM
+        "bert-bge", # WPM
+        "jina-v2-en", # WPM
+        "gpt-2", # BPE
         "llama-bpe", # BPE
+        "phi-2", # BPE
+        "deepseek-llm", # BPE
+        "deepseek-coder", # BPE
         "falcon", # BPE
+        "mpt", # BPE
         "starcoder", # BPE
-        "gpt-2", # BPE
+        "stablelm2", # BPE
+        "refact", # BPE
+        "qwen2", # BPE
+        "olmo", # BPE
         "jina-v2-es", # BPE
         "jina-v2-de", # BPE
-        "jina-v2-code", # BPE
         "smaug-bpe", # BPE
-        "phi-2", # BPE
-        "deepseek-coder", # BPE
-        "deepseek-llm", # BPE
+        "poro-chat", # BPE
+        "jina-v2-code", # BPE
     ]

+    logger.info("=" * 50)
     for tokenizer in tokenizers:
-        logger.info("=" * 50)
+        logger.info("-" * 50)
         logger.info(f"TOKENIZER: '{tokenizer}'")
         vocab_file = path_vocab_format % tokenizer
         dir_tokenizer = path_tokenizers + "/" + tokenizer