Tokenizer WPM fixes (#7500)

* Update random test: add_bos_token.
* Update random test: add WPM models for testing.
* Build vocab.special_tokens_cache using vocab token types.
* Fix and improve WPM preprocessing (see the sketch after this list).
  - Fix unicode edge case combinations.
  - Split by whitespace in the same pass.
* Discard all of a word's tokens when no match is found.
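
For reference, a minimal Python sketch of the WordPiece (WPM) rules these notes describe: whitespace splitting happens in the same pass as matching, and a word whose pieces cannot all be matched contributes a single unknown token instead of partial matches. This is an illustration only, not the llama.cpp C++ implementation; `wordpiece_encode`, the toy `vocab`, and `unk_id` are hypothetical.

def wordpiece_encode(text: str, vocab: dict[str, int], unk_id: int) -> list[int]:
    ids: list[int] = []
    for word in text.split():  # split by whitespace in the same pass
        start = 0
        word_ids: list[int] = []
        while start < len(word):
            end = len(word)
            piece_id = None
            while start < end:  # greedy longest match against the vocab
                piece = ("##" if start > 0 else "") + word[start:end]
                if piece in vocab:
                    piece_id = vocab[piece]
                    break
                end -= 1
            if piece_id is None:  # no match: discard all of this word's tokens
                word_ids = [unk_id]
                break
            word_ids.append(piece_id)
            start = end
        ids.extend(word_ids)
    return ids

vocab = {"hello": 0, "wor": 1, "##ld": 2}
print(wordpiece_encode("hello world zzz", vocab, unk_id=3))  # [0, 1, 2, 3]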
jaime-m-p 2024-05-28 21:46:34 +02:00 committed by GitHub
parent 6bd12ce409
commit 02c1ecad07
2 changed files with 75 additions and 167 deletions

@@ -167,8 +167,10 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
     for m in range(iterations):
         rand.seed(m)
         words = rand.choices(special_tokens, k=500)
-        if tokenizer.add_bos_token:  # skip spam warning of double BOS
-            while words and words[0] == tokenizer.bos_token:
+        if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                 words.pop(0)
+            if tokenizer.add_bos_token:  # drop all starting BOS
+                words.pop(0)
         yield "".join(words)
@@ -293,15 +295,17 @@ def main(argv: list[str] = None):
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
-    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True)
-    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False)
 
     def func_tokenize1(text: str):
         return model.tokenize(text, add_special=True, parse_special=True)
 
     def func_tokenize2(text: str):
         return tokenizer.encode(text, add_special_tokens=True)
 
+    ids = func_tokenize2("a")
+    assert 1 <= len(ids) <= 3
+    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
+    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)
 
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
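
The removed lines assumed every tokenizer exposes add_bos_token and defaulted it to True; WPM tokenizers such as BERT's have no such attribute. The added lines probe the tokenizer instead. The same heuristic in isolation (bert-base-uncased is an assumed stand-in for the bert-bge model used in this test):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("a", add_special_tokens=True)  # e.g. [CLS] a [SEP] -> 3 ids
assert 1 <= len(ids) <= 3  # at most BOS/CLS + "a" + EOS/SEP
add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
print(add_bos_token)  # False here: BERT prepends [CLS], which is not registered as bos_token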
@@ -324,8 +328,10 @@ if __name__ == "__main__":
     # import os
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
-        "llama-spm",   # SPM
-        "phi-3",       # SPM
+        # "llama-spm",  # SPM
+        # "phi-3",      # SPM
+        "jina-v2-en",  # WPM
+        "bert-bge",    # WPM
     ]
     for tokenizer in tokenizers:
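
The model swap matters because WPM tokenizers normalize text (lowercasing, accent stripping, whitespace cleanup) before matching, which is exactly the preprocessing this commit fixes. A quick check with a stock WPM tokenizer (again assuming bert-base-uncased as a stand-in):

from transformers import AutoTokenizer

wpm = AutoTokenizer.from_pretrained("bert-base-uncased")
# "É" is lowercased and accent-stripped to "e"; the no-break space counts as whitespace.
print(wpm.tokenize("H\u00c9llo\u00a0 World"))  # ['hello', 'world']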