Compare commits

...
Sign in to create a new pull request.

2 commits

Author SHA1 Message Date
Francis Couture-Harpin
50d1a035f0 convert_hf : fix Gemma v1 not setting BOS and EOS tokens 2024-07-19 22:46:35 -04:00
Francis Couture-Harpin
5a9cb57494 convert_hf : fix Gemma v1 conversion
* convert_hf : allow renaming tokens, but with a warning
2024-07-19 16:58:05 -04:00

View file

@@ -750,7 +750,8 @@ class Model:
token_id = int(token_id) token_id = int(token_id)
token: str = token_data["content"] token: str = token_data["content"]
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token.encode("utf-8") if tokens[token_id] != token.encode("utf-8"):
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
if token_data.get("special") or self.does_token_look_special(token): if token_data.get("special") or self.does_token_look_special(token):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL toktypes[token_id] = SentencePieceTokenTypes.CONTROL
else: else:
@@ -1309,6 +1310,7 @@ class RefactModel(Model):
special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("prefix", 1)
special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("suffix", 3)
special_vocab._set_special_token("middle", 2) special_vocab._set_special_token("middle", 2)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
@@ -2011,7 +2013,8 @@ class Phi3MiniModel(Model):
token_id = int(token_id) token_id = int(token_id)
token = foken_data["content"].encode("utf-8") token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2027,7 +2030,8 @@ class Phi3MiniModel(Model):
token_id = int(foken_data["id"]) token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8") token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert tokens[token_id] == token if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2266,7 +2270,8 @@ class InternLM2Model(Model):
chat_eos_token_id = token_id chat_eos_token_id = token_id
token = token.encode("utf-8") token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert(tokens[token_id] == token) if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2285,7 +2290,8 @@ class InternLM2Model(Model):
chat_eos_token_id = token_id chat_eos_token_id = token_id
token = token.encode("utf-8") token = token.encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
assert(tokens[token_id] == token) if tokens[token_id] != token:
logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
tokens[token_id] = token tokens[token_id] = token
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2471,6 +2477,7 @@ class GemmaModel(Model):
special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("fsep", 70)
special_vocab._set_special_token("eot", 107) special_vocab._set_special_token("eot", 107)
special_vocab.chat_template = None # do not add it twice
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False) self.gguf_writer.add_add_space_prefix(False)