From 60fd27b68d0198c9d8e47949eb887afa13d55684 Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 12 Nov 2024 22:16:34 -0800 Subject: [PATCH 01/10] Update test-tokenizer-random.py Added try/except --- tests/test-tokenizer-random.py | 177 ++++++++++++++++++--------------- 1 file changed, 99 insertions(+), 78 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 9ebe6c891..98fd6e40b 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -1,30 +1,38 @@ -# Test libllama tokenizer == AutoTokenizer. -# Brute force random words/text generation. -# -# Sample usage: -# -# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe -# +#!/usr/bin/env python3 +""" +Test libllama tokenizer against AutoTokenizer using brute force random words/text generation. + +Sample usage: + + python3 test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe +""" from __future__ import annotations - +# import time import logging import argparse +import shutil import subprocess import random import unicodedata - from pathlib import Path from typing import Any, Iterator, cast from typing_extensions import Buffer - +# +# External Imports import cffi from transformers import AutoTokenizer, PreTrainedTokenizer +# +#################################################################################################### +# +# Classes: logger = logging.getLogger("test-tokenizer-random") +if shutil.which("gcc") is None: + raise EnvironmentError("GCC is not available on this system. Please install GCC or use preprocessed headers.") class LibLlama: @@ -32,6 +40,12 @@ class LibLlama: DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"] DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.free() + def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None): path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H path_includes = path_includes or self.DEFAULT_PATH_INCLUDES @@ -408,83 +422,90 @@ def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100 def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]): + try: + # def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str): + # for i, (a, b) in enumerate(zip(ids1, ids2)): + # if a != b: + # return i + # if len(ids1) == len(ids2): + # return -1 + # return min(len(ids1), len(ids2)) + # Rewritten to use zip() and next() instead of for loop + def find_first_mismatch(ids1: Sequence[Any], ids2: Sequence[Any]) -> int: + return next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1) - def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str): - for i, (a, b) in enumerate(zip(ids1, ids2)): - if a != b: - return i - if len(ids1) == len(ids2): - return -1 - return min(len(ids1), len(ids2)) + def check_detokenizer(text: str, text1: str, text2: str) -> bool: + if text1 == text2: # equal to TokenizerGroundtruth? + return True + # equal to source text? 
+ if tokenizer1.add_bos_token: # remove BOS + if text2.startswith(tokenizer1.bos_token): + text2 = text2[len(tokenizer1.bos_token):] + if tokenizer1.add_eos_token: # remove EOS + if text2.endswith(tokenizer1.eos_token): + text2 = text2[:-len(tokenizer1.eos_token)] + return text == text2 - def check_detokenizer(text: str, text1: str, text2: str) -> bool: - if text1 == text2: # equal to TokenizerGroundtruth? - return True - # equal to source text? - if tokenizer1.add_bos_token: # remove BOS - if text2.startswith(tokenizer1.bos_token): - text2 = text2[len(tokenizer1.bos_token):] - if tokenizer1.add_eos_token: # remove EOS - if text2.endswith(tokenizer1.eos_token): - text2 = text2[:-len(tokenizer1.eos_token)] - return text == text2 + t_encode1 = 0 + t_encode2 = 0 + t_decode1 = 0 + t_decode2 = 0 + t_start = time.perf_counter() + encode_errors = 0 + decode_errors = 0 + MAX_ERRORS = 10 - t_encode1 = 0 - t_encode2 = 0 - t_decode1 = 0 - t_decode2 = 0 - t_start = time.perf_counter() - encode_errors = 0 - decode_errors = 0 - MAX_ERRORS = 10 - - logger.info("%s: %s" % (generator.__qualname__, "ini")) - for text in generator: - # print(repr(text), text.encode()) - # print(repr(text), hex(ord(text[0])), text.encode()) - t0 = time.perf_counter() - ids1 = tokenizer1.encode(text) - t1 = time.perf_counter() - ids2 = tokenizer2.encode(text) - t2 = time.perf_counter() - text1 = tokenizer1.decode(ids1) - t3 = time.perf_counter() - text2 = tokenizer2.decode(ids1) - t4 = time.perf_counter() - t_encode1 += t1 - t0 - t_encode2 += t2 - t1 - t_decode1 += t3 - t2 - t_decode2 += t4 - t3 - if encode_errors < MAX_ERRORS and ids1 != ids2: - i = find_first_mismatch(ids1, ids2) - ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] - ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] - logger.error(" Expected: " + str(ids1)) - logger.error(" Result: " + str(ids2)) - encode_errors += 1 - logger.error(f" {encode_errors=}") - if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2): - i = find_first_mismatch(text1, text2) - text1 = list(text1[max(0, i - 2) : i + 5 + 1]) - text2 = list(text2[max(0, i - 2) : i + 5 + 1]) - logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1)) - logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2)) - decode_errors += 1 - logger.error(f" {decode_errors=}") - if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: - logger.error(f" EXIT: {encode_errors=} {decode_errors=}") - # raise Exception() - break - - t_total = time.perf_counter() - t_start - logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") + logger.info("%s: %s" % (generator.__qualname__, "ini")) + for text in generator: + # print(repr(text), text.encode()) + # print(repr(text), hex(ord(text[0])), text.encode()) + t0 = time.perf_counter() + ids1 = tokenizer1.encode(text) + t1 = time.perf_counter() + ids2 = tokenizer2.encode(text) + t2 = time.perf_counter() + text1 = tokenizer1.decode(ids1) + t3 = time.perf_counter() + text2 = tokenizer2.decode(ids1) + t4 = time.perf_counter() + t_encode1 += t1 - t0 + t_encode2 += t2 - t1 + t_decode1 += t3 - t2 + t_decode2 += t4 - t3 + if encode_errors < MAX_ERRORS and ids1 != ids2: + i = find_first_mismatch(ids1, ids2) + ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] + ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] + logger.error(" Expected: " + str(ids1)) + logger.error(" Result: " + str(ids2)) + encode_errors += 1 + logger.error(f" {encode_errors=}") + if decode_errors < MAX_ERRORS and not 
check_detokenizer(text, text1, text2): + i = find_first_mismatch(text1, text2) + text1 = list(text1[max(0, i - 2) : i + 5 + 1]) + text2 = list(text2[max(0, i - 2) : i + 5 + 1]) + logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1)) + logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2)) + decode_errors += 1 + logger.error(f" {decode_errors=}") + if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: + logger.error(f" EXIT: {encode_errors=} {decode_errors=}") + # raise Exception() + break + t_total = time.perf_counter() - t_start + logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") + except Exception as e: + logger.exception(f"An error occurred during tokenizer comparison: {e}") def main(argv: list[str] | None = None): parser = argparse.ArgumentParser() parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file") parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + parser.add_argument("--max-errors", type=int, default=10, help="Maximum number of errors before stopping") + parser.add_argument("--iterations", type=int, default=100, help="Number of iterations for random generators") + parser.add_argument("--tokenizers", nargs="+", help="List of tokenizers to test", default=tokenizers) args = parser.parse_args(argv) logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO) @@ -563,4 +584,4 @@ if __name__ == "__main__": logger.info(f"TOKENIZER: '{tokenizer}'") vocab_file = Path(path_vocab_format % tokenizer) dir_tokenizer = path_tokenizers / tokenizer - main([str(vocab_file), str(dir_tokenizer), "--verbose"]) + main([str(vocab_file), str(dir_tokenizer), "--verbose"]) \ No newline at end of file From db26ba5b5cf6d73e5e219e1cd597fac3659eb446 Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 12 Nov 2024 22:24:03 -0800 Subject: [PATCH 02/10] Update test-tokenizer-random.py --- tests/test-tokenizer-random.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 98fd6e40b..752f1f902 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 """ -Test libllama tokenizer against AutoTokenizer using brute force random words/text generation. +# Test libllama tokenizer == AutoTokenizer. +# Brute force random words/text generation. 
Sample usage: @@ -584,4 +585,4 @@ if __name__ == "__main__": logger.info(f"TOKENIZER: '{tokenizer}'") vocab_file = Path(path_vocab_format % tokenizer) dir_tokenizer = path_tokenizers / tokenizer - main([str(vocab_file), str(dir_tokenizer), "--verbose"]) \ No newline at end of file + main([str(vocab_file), str(dir_tokenizer), "--verbose"]) From 82a4012c2aa4d644e205a91f65d58b5464daa31b Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 12 Nov 2024 22:24:35 -0800 Subject: [PATCH 03/10] Update test-tokenizer-random.py --- tests/test-tokenizer-random.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 752f1f902..5b95e3f09 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -25,10 +25,6 @@ from typing_extensions import Buffer import cffi from transformers import AutoTokenizer, PreTrainedTokenizer # -#################################################################################################### -# -# Classes: - logger = logging.getLogger("test-tokenizer-random") From 15748844833b944b159c6b70e6af4d2d05f1108a Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 12 Nov 2024 22:26:56 -0800 Subject: [PATCH 04/10] Update test-tokenizer-random.py --- tests/test-tokenizer-random.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 5b95e3f09..1ecc6a158 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 -""" +# # Test libllama tokenizer == AutoTokenizer. # Brute force random words/text generation. - -Sample usage: - - python3 test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe -""" +# +#Sample usage: +# +# python3 test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe +# from __future__ import annotations # From 18489671bff4ec0ca82bdad8b6f3f8ced46ee235 Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 12 Nov 2024 22:27:41 -0800 Subject: [PATCH 05/10] Update test-tokenizer-random.py --- tests/test-tokenizer-random.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 1ecc6a158..1a1cc33b8 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -5,7 +5,7 @@ # #Sample usage: # -# python3 test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe +# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe # from __future__ import annotations From 3275e293604d18bd46d89356eb95a7123ce3c63d Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 12 Nov 2024 22:28:30 -0800 Subject: [PATCH 06/10] Update test-tokenizer-random.py --- tests/test-tokenizer-random.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 1a1cc33b8..60aa64838 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -5,7 +5,7 @@ # #Sample usage: # -# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe +# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe # from __future__ import annotations From 5edd022d6a0ccbef97939257af6136fa6e4cccaa Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 12 Nov 2024 22:28:52 -0800 
Subject: [PATCH 07/10] Update test-tokenizer-random.py --- tests/test-tokenizer-random.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 60aa64838..6bc782b96 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -3,13 +3,13 @@ # Test libllama tokenizer == AutoTokenizer. # Brute force random words/text generation. # -#Sample usage: +# Sample usage: # # python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe # from __future__ import annotations -# + import time import logging import argparse From 235a268f96b480ee88308b789261c1b6b04d0a97 Mon Sep 17 00:00:00 2001 From: Robert Date: Wed, 13 Nov 2024 07:49:38 -0800 Subject: [PATCH 08/10] Update test-tokenizer-random.py Added blank lines for Lint test; Added sequence import from typing Removed 'free' call from Object --- tests/test-tokenizer-random.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 6bc782b96..2bc14e23f 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -18,7 +18,7 @@ import subprocess import random import unicodedata from pathlib import Path -from typing import Any, Iterator, cast +from typing import Any, Iterator, cast, Sequence from typing_extensions import Buffer # # External Imports @@ -31,18 +31,13 @@ logger = logging.getLogger("test-tokenizer-random") if shutil.which("gcc") is None: raise EnvironmentError("GCC is not available on this system. Please install GCC or use preprocessed headers.") + class LibLlama: DEFAULT_PATH_LLAMA_H = "./include/llama.h" DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"] DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.free() - def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None): path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H path_includes = path_includes or self.DEFAULT_PATH_INCLUDES @@ -495,6 +490,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl except Exception as e: logger.exception(f"An error occurred during tokenizer comparison: {e}") + def main(argv: list[str] | None = None): parser = argparse.ArgumentParser() parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file") From bc8648fbbeb8821b23bc06d9e09c3d269482b021 Mon Sep 17 00:00:00 2001 From: Robert Date: Sat, 16 Nov 2024 21:01:38 -0800 Subject: [PATCH 09/10] Update test-tokenizer-random.py Updated `find_first_mismatch` from suggestion by jaime-m-p regarding incorrect checking. 
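For reference, a minimal standalone sketch of the case this fix targets: the zip()/next() one-liner stops at the shorter sequence, so it reports -1 ("no mismatch") when one token list is a strict prefix of the other, while the corrected version falls back to the shorter length. The token ids below are made up for illustration only.

    def find_first_mismatch_old(ids1, ids2) -> int:
        # previous one-liner: a strict prefix looks like a perfect match
        return next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1)

    def find_first_mismatch_new(ids1, ids2) -> int:
        # patched version: a length difference is reported at the end of the shorter sequence
        index = next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1)
        if index < 0 and len(ids1) != len(ids2):
            index = min(len(ids1), len(ids2))
        return index

    assert find_first_mismatch_old([1, 2, 3], [1, 2, 3, 99]) == -1  # prefix case slips through
    assert find_first_mismatch_new([1, 2, 3], [1, 2, 3, 99]) == 3   # mismatch reported at index 3
    assert find_first_mismatch_new([1, 2, 3], [1, 5, 3]) == 1       # both versions agree here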
--- tests/test-tokenizer-random.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 2bc14e23f..93da1d21f 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -423,8 +423,11 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl # return -1 # return min(len(ids1), len(ids2)) # Rewritten to use zip() and next() instead of for loop - def find_first_mismatch(ids1: Sequence[Any], ids2: Sequence[Any]) -> int: - return next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1) + def find_first_mismatch(ids1, ids2) -> int: + index = next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1) + if index < 0 and len(ids1) != len(ids2): + index = min(len(ids1), len(ids2)) + return index def check_detokenizer(text: str, text1: str, text2: str) -> bool: if text1 == text2: # equal to TokenizerGroundtruth? From 883dc22d4475c07079abe33bc1d17d303f83e352 Mon Sep 17 00:00:00 2001 From: Robert Date: Sun, 17 Nov 2024 08:35:07 -0800 Subject: [PATCH 10/10] Update test-tokenizer-random.py Re-added type declarations --- tests/test-tokenizer-random.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 93da1d21f..7d84ff554 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -423,7 +423,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl # return -1 # return min(len(ids1), len(ids2)) # Rewritten to use zip() and next() instead of for loop - def find_first_mismatch(ids1, ids2) -> int: + def find_first_mismatch(ids1: Sequence[Any], ids2: Sequence[Any]) -> int: index = next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1) if index < 0 and len(ids1) != len(ids2): index = min(len(ids1), len(ids2))
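Beyond the annotation itself, Sequence[Any] matches how compare_tokenizers actually calls this helper: once with the two token-id lists and once with the two decoded strings. A short self-contained check, with sample values made up for illustration:

    from typing import Any, Sequence

    def find_first_mismatch(ids1: Sequence[Any], ids2: Sequence[Any]) -> int:
        # returns the index of the first differing element, or the shorter length on a prefix mismatch, or -1 if equal
        index = next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1)
        if index < 0 and len(ids1) != len(ids2):
            index = min(len(ids1), len(ids2))
        return index

    # token-id path (encode comparison) and text path (detokenizer comparison)
    assert find_first_mismatch([128000, 9906, 1917], [128000, 9906, 0]) == 2
    assert find_first_mismatch("Hello world", "Hello_world") == 5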