convert : fix byte tokens for --vocab-type hfft

This is inspired by 9f297f81ad, whose change got lost during the refactoring in 6efb8eb30e.
2024-01-22 19:14:26 +01:00 · 2024-01-22 19:14:26 +01:00 · 067ef868e9
commit 067ef868e9
parent 011e8ec577
1 changed file with 5 additions and 3 deletions
--- a/convert.py
+++ b/convert.py
@@ -509,11 +509,13 @@ class HfVocab:

            # Convert token text to bytes
            token_text = reverse_vocab[token_id].encode("utf-8")
+            if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                toktype = gguf.TokenType.BYTE
+            else:
+                toktype = self.get_token_type(token_id, self.special_ids)

            # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, self.special_ids  # Reuse already stored special IDs
-            )
+            yield token_text, self.get_token_score(token_id), toktype

    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
        # Determine token type based on whether it's a special token