diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ff39d3353..8f22fb530 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2439,12 +2439,15 @@ class GemmaModel(Model):
 
     def set_vocab(self):
         self._set_vocab_sentencepiece()
+
+        # TODO: these special tokens should be exported only for the CodeGemma family
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
-                                          special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
         special_vocab._set_special_token("prefix", 67)
         special_vocab._set_special_token("suffix", 69)
         special_vocab._set_special_token("middle", 68)
-        special_vocab._set_special_token("eot",    70)
+        special_vocab._set_special_token("fsep",   70)
+        special_vocab._set_special_token("eot",    107)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
diff --git a/llama.cpp b/llama.cpp
index 92b222392..09d25aaba 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2104,7 +2104,7 @@ struct llama_vocab {
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
-    id special_eot_id    = -1;
+    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
 
     bool add_space_prefix = true;
 
@@ -4151,7 +4151,10 @@ static void llm_load_vocab(
             vocab.special_prefix_id = 67;
             vocab.special_suffix_id = 69;
             vocab.special_middle_id = 68;
-            vocab.special_eot_id    = 70;
+            // TODO: this is not EOT, it is "file separator" token, needs fix
+            //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+            //vocab.special_eot_id  = 70;
+            vocab.special_eot_id    = 107;
         }
     }
 
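
A quick way to sanity-check the hard-coded IDs in this patch is to query the Hugging Face tokenizer directly. The sketch below is illustrative only and is not part of the patch: it assumes the transformers package is installed, that you have access to the gated google/codegemma-7b-it repository, and that the token strings listed ("<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|file_separator|>", "<end_of_turn>") are the ones the patch's IDs refer to.

    # Verification sketch (not part of the patch): print the token IDs that the
    # patch hard-codes. Token strings are an assumption of this sketch.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("google/codegemma-7b-it")

    # Expected per the patch: prefix=67, middle=68, suffix=69, fsep=70, eot=107
    for name in ("<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>",
                 "<|file_separator|>", "<end_of_turn>"):
        print(f"{name:>20} -> {tok.convert_tokens_to_ids(name)}")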