From 80f41234e40d4368960c85d7383c7b6a70cb5eac Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Wed, 7 Aug 2024 23:08:04 +0200
Subject: [PATCH] Update bruteforce test: fix binary search

---
 tests/test-tokenizer-random.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 4a5773fa5..f7c3b1407 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -513,14 +513,16 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
             a, b = 0, len(text)
             step = b
             while step > 1:
-                step = step // 2
-                if not _compare(text[a : b - step])[0]:
-                    b = b - step
+                step = (step + 1) // 2
+                t = max(a, b - step)
+                if not _compare(text[a : t])[0]:
+                    b = t
             step = b
             while step > 1:
-                step = step // 2
-                if not _compare(text[a + step : b])[0]:
-                    a = a + step
+                step = (step + 1) // 2
+                t = min(a + step, b)
+                if not _compare(text[t : b])[0]:
+                    a = t
             ok, ids1, ids2, text1, text2 = _compare(text[a : b])
             assert a <= b and not ok
             # show unique failing texts differences
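
Why the change: with floor halving (step = step // 2), the probe decrements need not cover every boundary position. For example, from step = 10 the old loop probes 5, 2, 1, which sum to 8, one short of the 9 needed, so the shrink could stop before the minimal failing slice. Ceil halving ((step + 1) // 2) probes 5, 3, 2, 1 instead, and the max/min clamps keep the probed slice inside [a, b] so the range cannot invert. For readers without the surrounding test code, below is a standalone sketch of the same shrinking strategy; shrink_failing_range and the fails predicate are illustrative names (the test's actual predicate is not _compare(...)[0]) and are not part of the patch.

    def shrink_failing_range(text: str, fails) -> tuple[int, int]:
        # Shrink [a, b) to a smaller slice of `text` that still fails.
        # `fails(s)` must return True while substring `s` reproduces the failure.
        a, b = 0, len(text)
        # Phase 1 (mirrors the first patched loop): pull the upper bound down
        # while the shorter prefix still fails.
        step = b
        while step > 1:
            step = (step + 1) // 2  # ceil halving: probe decrements sum to >= b - 1
            t = max(a, b - step)    # clamp so the slice never inverts
            if fails(text[a:t]):
                b = t
        # Phase 2 (mirrors the second patched loop): push the lower bound up
        # while the shorter suffix still fails.
        step = b
        while step > 1:
            step = (step + 1) // 2
            t = min(a + step, b)
            if fails(text[t:b]):
                a = t
        return a, b

    # Hypothetical failure for illustration: any text containing "☃" fails.
    a, b = shrink_failing_range("hello ☃ world", lambda s: "☃" in s)
    assert "hello ☃ world"[a:b] == "☃"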