llama : fix pre-tokenization of non-special added tokens (#8228)

* llama : fix mpt and olmo pre-tokenizer * llama : pre-tokenize non-special user-defined tokens first * llama : fix detection of control-like user-defined tokens * convert_hf : identify which user-defined tokens are control tokens Only used in _set_vocab_gpt2() for now. * convert_hf : identify more added control tokens for SPM tokenizers This makes Gemma and Gemma-2 tokenize pretty much EVERYTHING correctly, including HTML tags and consecutive spaces, but it unfortunately requires model re-conversion. There seems to be a weird behavior of the HF tokenizer for Gemma, which prefers to use the 16-space token over more lengthy space tokens, while using the SentencePiece tokenizer does not do this. (the implementation in llama.cpp has the same behavior as SentencePiece) * llama : fix wrong pre-tokenization of byte tokens * llama : fix Viking pre-tokenizer regex The order was previously wrong, which caused errors in some tests. * llama : fix command-r detokenization * convert_hf : reduce usages of the UNKNOWN token type * llama : add UNKNOWN tokens in the special tokens cache * convert_hf : reduce usages of UNKNOWN for InternLM2 This makes the changes from #8321 more consistent with the other changes made here. * test-tokenizer-random : reduce potential conflicts with #8379 * test-tokenizer-random : add a failing edge case for falcon
2024-07-13 23:35:10 -04:00 · 2024-07-13 23:35:10 -04:00 · fa79495bb4
commit fa79495bb4
parent 17eb6aa8a9
4 changed files with 91 additions and 61 deletions
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -20,7 +20,7 @@ from typing import Any, Iterator, cast
 from typing_extensions import Buffer

 import cffi
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, PreTrainedTokenizer


 logger = logging.getLogger("test-tokenizer-random")
@@ -129,7 +129,7 @@ class Tokenizer:
 class TokenizerGroundtruth (Tokenizer):

    def __init__(self, dir_tokenizer: str):
-        self.model = AutoTokenizer.from_pretrained(dir_tokenizer)
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
        # guess BOS and EOS
        ids = self.encode("a")
        assert 1 <= len(ids) <= 3
@@ -143,7 +143,7 @@ class TokenizerGroundtruth (Tokenizer):
        self.vocab = list(sorted(self.vocab))
        # tokens and lists
        self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens   = list(self.model.added_tokens_encoder)
+        self.added_tokens   = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
        self.bos_token = self.model.bos_token
        self.eos_token = self.model.eos_token

@@ -232,6 +232,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
        'a\na',            # bert fail
        '"`',              # falcon
        ' \u2e4e',         # falcon
+        '\n\x0b  ',        # falcon
        'a\xa0\xa0\x00b',  # jina-v2-es
        'one <mask>',      # jina-v2-es  <mask> lstrip=true
        'a </s> b',        # rstrip phi-3