fix func call tokens for internlm2

This commit is contained in:
RunningLeon 2024-07-16 17:03:29 +08:00
parent 7acfd4e8d5
commit 7b575e70f5

View file

@ -2213,7 +2213,7 @@ class InternLM2Model(Model):
chat_eos_token = '<|im_end|>'
chat_eos_token_id = None
func_call_tokens = ('<|plugin|>', '<|interpreter|>', '<|action_end|>', '<|action_start|>')
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
    with open(tokenizer_config_file, "r", encoding="utf-8") as f:
@ -2230,7 +2230,7 @@ class InternLM2Model(Model):
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
- if foken_data.get("special"):
+ if foken_data.get("special") and not foken_data["content"] in func_call_tokens:
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
tokenizer_file = self.dir_model / 'tokenizer.json'
@ -2249,7 +2249,7 @@ class InternLM2Model(Model):
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
- if foken_data.get("special"):
+ if foken_data.get("special") and not foken_data["content"] in func_call_tokens:
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama")