diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index f6afe0cd0..dc41cf199 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -167,13 +167,13 @@ def generator_random_chars(iterations = 100) -> Iterator[str]: """Brute force random text with simple characters""" WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) - CHARS = list(set(""" + CHARS = list(sorted(set(""" ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ áéíóúàèìòùâêîôûäëïöü .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_ - """)) + """))) rand = random.Random() for m in range(iterations): @@ -194,7 +194,7 @@ def generator_random_vocab_chars(vocab: list[str], iterations = 100) -> Iterator vocab_chars = set() for word in vocab: vocab_chars.update(word) - vocab_chars = list(vocab_chars) + vocab_chars = list(sorted(vocab_chars)) rand = random.Random() for m in range(iterations): @@ -260,7 +260,7 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1] ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1] text2 = tokenizer.decode(ids2, skip_special_tokens=True) - assert (text2 in text) + #assert (text2 in text) logger.info(" Text: " + repr(text2)) logger.info(" TokenIDs: " + str(ids1)) logger.info(" Expected: " + str(ids2)) @@ -288,7 +288,7 @@ if __name__ == "__main__": def func_tokenize1(text:str): return model.tokenize(text, add_special=False, parse_special=parse_special) - vocab = tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True) + vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True))) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text()) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases()) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))