Tokenizer WPM fixes (#7500)
* Update random test: add_bos_token.
* Update random test: add WPM models for testing.
* Build vocab.special_tokens_cache using vocab token types.
* Fix and improve WPM preprocessing:
  - Fix unicode edge case combinations.
  - Split by whitespace in the same pass.
* Discard all tokens when no match is found.
commit 02c1ecad07 (parent 6bd12ce409)
2 changed files with 75 additions and 167 deletions
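The WPM preprocessing changes themselves live in llama.cpp (the second changed file, whose hunks are not shown below); only the test updates appear here. As a rough, hypothetical Python sketch of the behavior the commit message describes, whitespace splitting in the same pass, greedy longest-match against the vocab, and discarding all tokens for a word when no match is found: the function names, the "##" continuation prefix, and the UNK fallback are assumptions based on BERT-style WordPiece, not code taken from this PR.

```python
def wpm_tokenize_word(word: str, vocab: dict[str, int]) -> list[int]:
    # Greedy longest-match: repeatedly take the longest vocab piece at `start`.
    tokens: list[int] = []
    start = 0
    while start < len(word):
        end = len(word)
        match = None
        while start < end:
            piece = word[start:end] if start == 0 else "##" + word[start:end]
            if piece in vocab:
                match = vocab[piece]
                break
            end -= 1
        if match is None:
            return []  # no match found: discard all tokens already produced for this word
        tokens.append(match)
        start = end
    return tokens


def wpm_tokenize(text: str, vocab: dict[str, int], unk_id: int) -> list[int]:
    ids: list[int] = []
    for word in text.split():  # whitespace split (the C++ code folds this into the same pass)
        piece_ids = wpm_tokenize_word(word, vocab)
        ids.extend(piece_ids if piece_ids else [unk_id])  # UNK fallback is an assumption
    return ids


vocab = {"hello": 0, "un": 1, "##affable": 2, "##aff": 3}
print(wpm_tokenize("hello unaffable xyz", vocab, unk_id=100))  # [0, 1, 2, 100]
```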
@@ -167,8 +167,10 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
     for m in range(iterations):
         rand.seed(m)
         words = rand.choices(special_tokens, k=500)
-        if tokenizer.add_bos_token:  # skip spam warning of double BOS
-            while words and words[0] == tokenizer.bos_token:
+        if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                 words.pop(0)
+            if tokenizer.add_bos_token:  # drop all starting BOS
+                words.pop(0)
         yield "".join(words)
 
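To see what the new branch does, here is a toy stand-in with plain strings instead of a real tokenizer (the token values are mine, not from the PR): duplicate leading BOS tokens are collapsed to one, and if the tokenizer will prepend BOS itself anyway, the remaining explicit one is dropped too, so the comparison never sees a doubled BOS.

```python
# Toy illustration of the new leading-BOS handling (hypothetical values).
words = ["<s>", "<s>", "<s>", "hello", "</s>"]
bos_token, add_bos_token = "<s>", True

if words[0] == bos_token:                                  # skip spam warning of double BOS
    while len(words) > 1 and words[1] == bos_token:        # leave one starting BOS
        words.pop(0)
    if add_bos_token:                                      # drop all starting BOS
        words.pop(0)

print("".join(words))  # "hello</s>"  (BOS will be re-added by the tokenizer itself)
```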
@@ -293,15 +295,17 @@ def main(argv: list[str] = None):
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
 
-    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True)
-    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False)
-
     def func_tokenize1(text: str):
         return model.tokenize(text, add_special=True, parse_special=True)
 
     def func_tokenize2(text: str):
         return tokenizer.encode(text, add_special_tokens=True)
 
+    ids = func_tokenize2("a")
+    assert 1 <= len(ids) <= 3
+    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
+    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)
+
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
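The hard-coded defaults are replaced by a probe: encode a trivial string and check whether the first id is the BOS id, since some HF tokenizers do not expose an add_bos_token attribute at all. A standalone sketch of that heuristic follows; the function name and example ids are mine, not from the PR.

```python
from typing import Callable, Optional

def detect_add_bos(encode: Callable[[str], list[int]], bos_token_id: Optional[int]) -> bool:
    # Encode a single character: the result is the piece alone, or the piece
    # wrapped by BOS/EOS-style specials. BOS is auto-added iff it leads the ids.
    ids = encode("a")
    assert 1 <= len(ids) <= 3
    return len(ids) > 1 and bos_token_id is not None and ids[0] == bos_token_id

# e.g. an SPM-like tokenizer that prepends id 1 (BOS) before the piece for "a":
print(detect_add_bos(lambda s: [1, 264], bos_token_id=1))  # True
print(detect_add_bos(lambda s: [264],    bos_token_id=1))  # False
```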
@@ -324,8 +328,10 @@ if __name__ == "__main__":
     # import os
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
-        "llama-spm",   # SPM
-        "phi-3",       # SPM
+        # "llama-spm",   # SPM
+        # "phi-3",       # SPM
+        "jina-v2-en",  # WPM
+        "bert-bge",    # WPM
     ]
 
     for tokenizer in tokenizers:
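For context, a hypothetical sketch of how the `for tokenizer in tokenizers:` loop above might consume these names, pairing a GGUF vocab file with the matching HuggingFace tokenizer directory; the path layout and the commented-out call are assumptions, not code from this PR.

```python
import logging

path_tokenizers   = "./models/tokenizers/"         # assumed layout, not from the PR
path_vocab_format = "./models/ggml-vocab-%s.gguf"  # assumed layout, not from the PR

for name in ["jina-v2-en", "bert-bge"]:            # the WPM models enabled above
    gguf_vocab = path_vocab_format % name
    hf_dir     = path_tokenizers + name
    logging.info("testing vocab %s against tokenizer %s", gguf_vocab, hf_dir)
    # main([gguf_vocab, hf_dir])  # assumed call into def main(argv) shown above
```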