add comment to glm prefix and suffix

This commit is contained in:
toyer 2024-06-27 02:55:52 +00:00
parent 0595f03dd1
commit 7357273e08
4 changed files with 13 additions and 13 deletions

View file

@@ -3043,12 +3043,12 @@ class ChatGLMModel(Model):
self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_add_bos_token(False)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.chat_template = "chatglm4" special_vocab.chat_template = "chatglm4"
special_vocab.merges = merges special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json # only add special tokens when they were not already loaded from config.json
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
# this one is usually not in config.json anyway # this one is usually not in config.json anyway

View file

@@ -104,7 +104,7 @@ class Keys:
ADD_BOS = "tokenizer.ggml.add_bos_token" ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token" ADD_EOS = "tokenizer.ggml.add_eos_token"
ADD_PREFIX = "tokenizer.ggml.add_space_prefix" ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
HF_JSON = "tokenizer.huggingface.json" HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world" RWKV = "tokenizer.rwkv.world"

View file

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.9.1" version = "0.9.0"
description = "Read and write ML models in GGUF for GGML" description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [

View file

@@ -14745,10 +14745,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} }
// add prefix to chatglm3 // add prefix to chatglm3
if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM3) { if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM3) {
output.push_back(64790); output.push_back(64790); // [gMask]
output.push_back(64792); output.push_back(64792); // sop
output.push_back(64795); output.push_back(64795); // <|user|>
output.push_back(30910); output.push_back(30910); // \n
output.push_back(13); output.push_back(13);
} }
@@ -14787,7 +14787,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} }
// add suffix to chatglm3 // add suffix to chatglm3
if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM3) { if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM3) {
output.push_back(64796); output.push_back(64796); // <|assistant|>
} }
} break; } break;
case LLAMA_VOCAB_TYPE_BPE: case LLAMA_VOCAB_TYPE_BPE:
@@ -14799,10 +14799,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} }
// add prefix to chatglm4 // add prefix to chatglm4
if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM4) { if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM4) {
output.push_back(151331); output.push_back(151331); // [gMASK]
output.push_back(151333); output.push_back(151333); // <sop>
output.push_back(151336); output.push_back(151336); // <|user|>
output.push_back(198); output.push_back(198); // \n
} }
for (const auto & fragment : fragment_buffer) { for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@@ -14823,7 +14823,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} }
// add suffix to chatglm4 // add suffix to chatglm4
if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM4) { if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_CHATGLM4) {
output.push_back(151337); output.push_back(151337); // <|assistant|>
} }
} break; } break;
case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_WPM: