Moved regex patterns to unicode.cpp and updated unicode.h

This commit is contained in:
Kazim Abrar Mahi 2024-03-23 01:13:08 +06:00 committed by Georgi Gerganov
parent 6fbab2dbc8
commit d2cfc2225f
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 80 additions and 71 deletions

View file

@ -4228,6 +4228,16 @@ static void llm_load_vocab(
if (add_space_prefix_keyidx != -1) {
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
} // The default value of add_space_prefix is true.
} else if (tokenizer_name == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens
vocab.special_bos_id = 101;
vocab.special_eos_id = 102;
vocab.special_unk_id = 100;
vocab.special_sep_id = -1;
vocab.special_pad_id = -1;
vocab.add_space_prefix = false;
} else {
if (tokenizer_name == "gpt2") {
vocab.type = LLAMA_VOCAB_TYPE_BPE;
@ -4272,25 +4282,6 @@ static void llm_load_vocab(
vocab.special_unk_id = -1;
vocab.special_sep_id = -1;
vocab.special_pad_id = -1;
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;
} else if (tokenizer_name == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens
vocab.special_bos_id = -1;
vocab.special_eos_id = -1;
vocab.special_unk_id = 100;
vocab.special_sep_id = 102;
vocab.special_pad_id = 0;
vocab.special_cls_id = 101;
vocab.special_mask_id = 103;
vocab.add_space_prefix = false;
} else {
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
vocab.type = LLAMA_VOCAB_TYPE_SPM;
}
}
@ -12223,15 +12214,15 @@ private:
}
// Runs the regex-based BPE preprocessing step on `text` using the GPT-2
// pattern, and returns the pieces produced by regex_bpe_preprocess.
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
    // The pattern is obtained through get_gpt2_regex(). A preceding return
    // that still referenced the old gpt2_regex identifier made this call
    // unreachable, so only the accessor-based return is kept.
    return regex_bpe_preprocess(text, get_gpt2_regex());
}
// Runs the regex-based BPE preprocessing step on `text` using the
// DeepSeek-Coder pattern, and returns the pieces produced by
// regex_bpe_preprocess.
std::vector<std::string> bpe_deepseek_coder_preprocess(const std::string & text) {
    // The pattern is obtained through get_deepseek_coder_regex(). A preceding
    // return that still referenced the old deepseek_coder_regex identifier
    // made this call unreachable, so only the accessor-based return is kept.
    return regex_bpe_preprocess(text, get_deepseek_coder_regex());
}
// Runs the regex-based BPE preprocessing step on `text` using the
// DeepSeek-LLM pattern, and returns the pieces produced by
// regex_bpe_preprocess.
std::vector<std::string> bpe_deepseek_llm_preprocess(const std::string & text) {
    // The pattern is obtained through get_deepseek_llm_regex(). A preceding
    // return that still referenced the old deepseek_llm_regex identifier
    // made this call unreachable, so only the accessor-based return is kept.
    return regex_bpe_preprocess(text, get_deepseek_llm_regex());
}
const llama_vocab & vocab;

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long