From 3eb1900e5cc362ebbfcfad940da67eb6a5eaa042 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Wed, 10 Jul 2024 00:46:19 +0200
Subject: [PATCH] Skip literal UNUSED token checks

---
 tests/test-tokenizer-random.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index e7fed3fa3..ee79d7c27 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -533,10 +533,20 @@ def compare_vocabs(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaC
     if vocab1 != vocab2:
         num_errors = 0
         for i in range(max(len(vocab1), len(vocab2))):
-            text1 = vocab1[i] if i < len(vocab1) else ""
-            text2 = vocab2[i] if i < len(vocab2) else ""
-            is_unused = text1.startswith("[UNUSED_TOKEN_")  # AutoTokenizer adds more unused tokens than SentencePiece ?
-            if text1 != text2 and is_unused and text2:
+            text1 = vocab1[i] if i < len(vocab1) else None
+            text2 = vocab2[i] if i < len(vocab2) else None
+            if True:  #WIP: SentencePiece adds more unused tokens than AutoTokenizer ?
+                if text1 is None:
+                    if not text2 or text2.startswith('[PAD'):  # is unused ?  #TODO: use toktypes
+                        text2 = None
+                else:
+                    #TODO: is "UNUSED_TOKEN_" valid for all models ?
+                    text1 = text1.replace("[UNUSED_TOKEN_", "[PAD")
+                #if text1 is None or text1.startswith("[UNUSED_TOKEN_"):  # is unused ?
+                #    text1 = ""
+                #if text2 is None or text2.startswith('[PAD'):  # is unused ?
+                #    text2 = ""
+            if text1 != text2:
                 num_errors += 1
                 if num_errors < MAX_PRINT_ERRORS:
                     logger.error(f"  {detokenize=} id={i} expected={repr(text1)} result={repr(text2)}")
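
For reference, below is a minimal standalone sketch of the normalization this hunk performs, runnable outside the test harness. The helper name normalize_unused and the sample vocabularies are illustrative assumptions, not part of the patch, and whether the "[UNUSED_TOKEN_" prefix applies to all models is still an open TODO in the hunk itself.

# Standalone sketch of the unused-token normalization above (not part of the
# patch). The "[UNUSED_TOKEN_" -> "[PAD" mapping follows the hunk; the sample
# vocabularies are made up for illustration.

def normalize_unused(text1, text2):
    """Map AutoTokenizer-style unused tokens onto SentencePiece-style ones."""
    if text1 is None:
        # Ground-truth vocab is shorter: treat trailing [PAD...]/empty ids as unused.
        if not text2 or text2.startswith("[PAD"):
            text2 = None
    else:
        # Rename so e.g. "[UNUSED_TOKEN_0]" compares equal to "[PAD0]".
        text1 = text1.replace("[UNUSED_TOKEN_", "[PAD")
    return text1, text2

vocab1 = ["<s>", "[UNUSED_TOKEN_0]"]  # ground truth (AutoTokenizer)
vocab2 = ["<s>", "[PAD0]", "[PAD1]"]  # llama.cpp (SentencePiece)
for i in range(max(len(vocab1), len(vocab2))):
    t1 = vocab1[i] if i < len(vocab1) else None
    t2 = vocab2[i] if i < len(vocab2) else None
    t1, t2 = normalize_unused(t1, t2)
    if t1 != t2:
        print(f"mismatch at id={i}: expected={t1!r} result={t2!r}")

With these inputs nothing is printed: id 1 matches after renaming, and id 2 (present only in the longer vocab) is treated as unused rather than counted as an error.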