From 1caa20fc7a4bd0eac1cc26e5c7262c3dadeaf952 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 10 Jul 2024 17:33:04 -0400 Subject: [PATCH] convert_hf : reduce usages of UNKNOWN for InternLM2 This makes the changes from #8321 more consistent with the other changes made here. --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0236166b3..c15c126eb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2189,7 +2189,7 @@ class InternLM2Model(Model): toktype = SentencePieceTokenTypes.BYTE # take care of ununsed raw token if piece.startswith('[UNUSED'): - toktype = SentencePieceTokenTypes.UNKNOWN + toktype = SentencePieceTokenTypes.UNUSED tokens.append(text) scores.append(score) @@ -2219,7 +2219,7 @@ class InternLM2Model(Model): if token == chat_eos_token: chat_eos_token_id = token_id token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: assert(tokens[token_id] == token) tokens[token_id] = token scores[token_id] = -1000.0 @@ -2238,7 +2238,7 @@ class InternLM2Model(Model): if token == chat_eos_token: chat_eos_token_id = token_id token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN: + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: assert(tokens[token_id] == token) tokens[token_id] = token scores[token_id] = -1000.0