Merge branch 'glm_support'

This commit is contained in:
toyer 2024-06-25 06:26:49 +00:00
commit 3557944893

View file

@@ -4836,10 +4836,6 @@ static void llm_load_vocab(
return; return;
} else if (tokenizer_model == "llama") { } else if (tokenizer_model == "llama") {
vocab.type = LLAMA_VOCAB_TYPE_SPM; vocab.type = LLAMA_VOCAB_TYPE_SPM;
// chatglm3 needs to preprocess prefix and suffix
if (tokenizer_pre == "chatglm-spm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM3;
}
// default special tokens // default special tokens
vocab.special_bos_id = 1; vocab.special_bos_id = 1;
@@ -4988,6 +4984,13 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
vocab.tokenizer_add_bos = true; vocab.tokenizer_add_bos = true;
vocab.tokenizer_add_eos = false; vocab.tokenizer_add_eos = false;
// chatglm3 needs to preprocess prefix and suffix
if (tokenizer_pre == "chatglm-spm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM3;
vocab.tokenizer_add_bos = false;
vocab.tokenizer_add_eos = false;
vocab.tokenizer_add_space_prefix = false;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
vocab.tokenizer_add_bos = true; vocab.tokenizer_add_bos = true;
@@ -14220,7 +14223,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// tokenizer.encode('', add_special_tokens=False) returns [] // tokenizer.encode('', add_special_tokens=False) returns []
bool is_prev_special = false; bool is_prev_special = false;
if (add_special && vocab.tokenizer_add_bos) { if (add_special && vocab.tokenizer_add_bos) {
GGML_ASSERT(vocab.special_bos_id != -1); GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id); output.push_back(vocab.special_bos_id);