Update bruteforce test: fix binary search

This commit is contained in:
jaime-m-p 2024-08-07 23:08:04 +02:00
parent 2ca313830e
commit 80f41234e4

View file

@@ -513,14 +513,16 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
 a, b = 0, len(text)
 step = b
 while step > 1:
-    step = step // 2
-    if not _compare(text[a : b - step])[0]:
-        b = b - step
+    step = (step + 1) // 2
+    t = max(a, b - step)
+    if not _compare(text[a : t])[0]:
+        b = t
 step = b
 while step > 1:
-    step = step // 2
-    if not _compare(text[a + step : b])[0]:
-        a = a + step
+    step = (step + 1) // 2
+    t = min(a + step, b)
+    if not _compare(text[t : b])[0]:
+        a = t
 ok, ids1, ids2, text1, text2 = _compare(text[a : b])
 assert a <= b and not ok
 # show unique failing texts differences