Support Llama 3 conversion

The Llama 3 tokenizer is BPE, so LlamaHfVocab now rejects it with a TypeError directing users to BpeVocab.
2024-04-18 18:38:05 +02:00 · 2024-04-18 18:38:05 +02:00 · d79ab101c3
commit d79ab101c3
parent 0d56246f4b
2 changed files with 22 additions and 9 deletions
--- a/convert.py
+++ b/convert.py
@@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):