llama : fix codegemma EOT token + add TODOs
parent 7ab0939c0d
commit 23b8dd7dd4
2 changed files with 10 additions and 4 deletions
@@ -2439,12 +2439,15 @@ class GemmaModel(Model):
 
     def set_vocab(self):
         self._set_vocab_sentencepiece()
+
+        # TODO: these special tokens should be exported only for the CodeGemma family
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
         special_vocab._set_special_token("prefix", 67)
         special_vocab._set_special_token("suffix", 69)
         special_vocab._set_special_token("middle", 68)
-        special_vocab._set_special_token("eot", 70)
+        special_vocab._set_special_token("fsep", 70)
+        special_vocab._set_special_token("eot", 107)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
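The change stops exporting token 70 as "eot" (per the TODO below, 70 is actually the file-separator token) and exports the real end-of-turn token 107 instead, with 70 moved under the new "fsep" type. A minimal sketch for spot-checking those hard-coded IDs against the CodeGemma tokenizer, assuming the transformers package and access to google/codegemma-7b-it; the token strings follow the tokenizer_config.json linked in the llama.cpp hunk below:

# Sketch: confirm the special-token IDs hard-coded by the convert script.
# Assumes `transformers` is installed and google/codegemma-7b-it is reachable.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/codegemma-7b-it")

# Token strings per the linked tokenizer_config.json; IDs are what the script writes.
expected = {
    "<|fim_prefix|>":     67,
    "<|fim_middle|>":     68,
    "<|fim_suffix|>":     69,
    "<|file_separator|>": 70,    # previously exported as "eot" by mistake
    "<end_of_turn>":      107,   # the actual end-of-turn token
}

for token, want in expected.items():
    got = tok.convert_tokens_to_ids(token)
    print(f"{token:22} expected={want:3} got={got} {'ok' if got == want else 'MISMATCH'}")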
@@ -2104,7 +2104,7 @@ struct llama_vocab {
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
-    id special_eot_id = -1;
+    id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
 
     bool add_space_prefix = true;
 
@@ -4151,7 +4151,10 @@ static void llm_load_vocab(
             vocab.special_prefix_id = 67;
             vocab.special_suffix_id = 69;
             vocab.special_middle_id = 68;
-            vocab.special_eot_id = 70;
+            // TODO: this is not EOT, it is "file separator" token, needs fix
+            //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+            //vocab.special_eot_id = 70;
+            vocab.special_eot_id = 107;
         }
     }
 
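Once a CodeGemma model is re-converted, the exported metadata can be inspected to confirm the new IDs were written. A minimal sketch using the gguf-py package from this repo; the tokenizer.ggml.<type>_token_id key names follow the convention SpecialVocab uses, the "fsep" key in particular is an assumption (a writer without a handler for that type will simply skip it), and "codegemma.gguf" is a placeholder path:

# Sketch: dump the special-token IDs recorded in a converted GGUF file.
# Assumes the gguf-py package from this repo; "codegemma.gguf" is a placeholder path.
from gguf import GGUFReader

reader = GGUFReader("codegemma.gguf")

for token_type in ("prefix", "suffix", "middle", "fsep", "eot"):
    key = f"tokenizer.ggml.{token_type}_token_id"  # "fsep" key name is an assumption
    field = reader.fields.get(key)
    if field is None:
        print(f"{key}: <not present>")
        continue
    # Scalar metadata is stored as a single-element part; data[0] indexes the value part.
    print(f"{key}: {int(field.parts[field.data[0]][0])}")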