From ded2ad5b88a2ae21a47ce929480467b9595c4b67 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 5 Feb 2024 13:42:54 +0200 Subject: [PATCH 1/2] py : handle byte tokens in `get_token_type` --- convert.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/convert.py b/convert.py index 75c100118..a89bbc0ac 100755 --- a/convert.py +++ b/convert.py @@ -515,10 +515,14 @@ class HfVocab: # Yield token text, score, and type yield token_text, self.get_token_score(token_id), self.get_token_type( - token_id, self.special_ids # Reuse already stored special IDs + token_id, token_text, self.special_ids # Reuse already stored special IDs ) - def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType: + def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: + # Special case for byte tokens + if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + return gguf.TokenType.BYTE + # Determine token type based on whether it's a special token return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL @@ -530,7 +534,7 @@ class HfVocab: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type(self.specials[text], self.special_ids) + toktype = self.get_token_type(self.specials[text], [], self.special_ids) score = self.get_token_score(self.specials[text]) else: toktype = gguf.TokenType.USER_DEFINED From adcf16fd685c6c9d336868667dd509592934fe2b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 5 Feb 2024 19:53:07 +0200 Subject: [PATCH 2/2] py : fix empty bytes arg --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index a89bbc0ac..4a2847a27 100755 --- a/convert.py +++ b/convert.py @@ -534,7 +534,7 @@ class HfVocab: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type(self.specials[text], [], self.special_ids) + toktype = self.get_token_type(self.specials[text], b'', self.special_ids) score = self.get_token_score(self.specials[text]) else: toktype = gguf.TokenType.USER_DEFINED