From 55e387b2d55e5704c65575b298b7d479b145687d Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 25 May 2024 00:19:31 +0200 Subject: [PATCH] Add BPE models for testing --- tests/test-tokenizer-random.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index b3ec4d0da..f69038c87 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -156,6 +156,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]: 'a', # Phi-3 fail '<|endoftext|>', # Phi-3 fail 'a\na', # TODO: Bert fail + 'a\xa0\xa0\x00b', # jina-v2-es ] @@ -328,8 +329,15 @@ if __name__ == "__main__": # import os # tokenizers = os.listdir(path_tokenizers) tokenizers = [ - "llama-spm", # SPM - "phi-3", # SPM + # "llama-spm", # SPM + # "phi-3", # SPM + # "bert-bge", # WPM + # "jina-v2-en", # WPM + "gpt-2", # BPE + "llama-bpe", # BPE + "jina-v2-es", # BPE + "jina-v2-de", # BPE + "phi-2", # BPE ] for tokenizer in tokenizers: