From 067ef868e95167d545101a96ed2f72a42116e0cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Romain=20=E2=80=9CArtefact2=E2=80=9D=20Dal=20Maso?=
Date: Mon, 22 Jan 2024 19:14:26 +0100
Subject: [PATCH] convert : fix byte tokens for --vocab-type hfft

This is inspired by 9f297f81adb93fdfefbd6496ff50cdfe070bc775, which got
lost during the refactoring in 6efb8eb30e7025b168f3fda3ff83b9b386428ad6.
---
 convert.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/convert.py b/convert.py
index 06768033d..f6e8520b2 100755
--- a/convert.py
+++ b/convert.py
@@ -509,11 +509,13 @@ class HfVocab:
             # Convert token text to bytes
             token_text = reverse_vocab[token_id].encode("utf-8")

+            if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                toktype = gguf.TokenType.BYTE
+            else:
+                toktype = self.get_token_type(token_id, self.special_ids)
             # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, self.special_ids  # Reuse already stored special IDs
-            )
+            yield token_text, self.get_token_score(token_id), toktype

     def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
         # Determine token type based on whether it's a special token