llama : fix codegemma EOT token + add TODOs

This commit is contained in:
Georgi Gerganov 2024-04-21 11:15:45 +03:00
parent 7ab0939c0d
commit 23b8dd7dd4
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 10 additions and 4 deletions

View file

@ -2439,12 +2439,15 @@ class GemmaModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
# TODO: these special tokens should be exported only for the CodeGemma family
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
special_vocab._set_special_token("prefix", 67)
special_vocab._set_special_token("suffix", 69)
special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("eot", 70)
special_vocab._set_special_token("fsep", 70)
special_vocab._set_special_token("eot", 107)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):

View file

@ -2104,7 +2104,7 @@ struct llama_vocab {
id special_prefix_id = -1;
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
bool add_space_prefix = true;
@ -4151,7 +4151,10 @@ static void llm_load_vocab(
vocab.special_prefix_id = 67;
vocab.special_suffix_id = 69;
vocab.special_middle_id = 68;
vocab.special_eot_id = 70;
// TODO: this is not EOT, it is "file separator" token, needs fix
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
//vocab.special_eot_id = 70;
vocab.special_eot_id = 107;
}
}