convert : fix byte tokens for --vocab-type hfft
This is inspired by 9f297f81ad, which got lost during the refactoring in 6efb8eb30e.
This commit is contained in:
parent
011e8ec577
commit
067ef868e9
1 changed file with 5 additions and 3 deletions
|
@@ -509,11 +509,13 @@ class HfVocab:
|
|||
|
||||
# Convert token text to bytes
|
||||
token_text = reverse_vocab[token_id].encode("utf-8")
|
||||
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
toktype = self.get_token_type(token_id, self.special_ids)
|
||||
|
||||
# Yield token text, score, and type
|
||||
yield token_text, self.get_token_score(token_id), self.get_token_type(
|
||||
token_id, self.special_ids # Reuse already stored special IDs
|
||||
)
|
||||
yield token_text, self.get_token_score(token_id), toktype
|
||||
|
||||
def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
|
||||
# Determine token type based on whether it's a special token
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue