From 067ef868e95167d545101a96ed2f72a42116e0cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Romain=20=E2=80=9CArtefact2=E2=80=9D=20Dal=20Maso?=
Date: Mon, 22 Jan 2024 19:14:26 +0100
Subject: [PATCH] convert : fix byte tokens for --vocab-type hfft

This is inspired by 9f297f81adb93fdfefbd6496ff50cdfe070bc775, which got
lost during the refactoring in 6efb8eb30e7025b168f3fda3ff83b9b386428ad6.
---
 convert.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/convert.py b/convert.py
index 06768033d..f6e8520b2 100755
--- a/convert.py
+++ b/convert.py
@@ -509,11 +509,13 @@ class HfVocab:
             # Convert token text to bytes
             token_text = reverse_vocab[token_id].encode("utf-8")

+            if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                toktype = gguf.TokenType.BYTE
+            else:
+                toktype = self.get_token_type(token_id, self.special_ids)
             # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, self.special_ids  # Reuse already stored special IDs
-            )
+            yield token_text, self.get_token_score(token_id), toktype

     def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
         # Determine token type based on whether it's a special token