llama : add pre-tokenizer regexes for BLOOM and gpt3-finnish (#8850)
This commit is contained in:
parent
d5492f0525
commit
6bda7ce6c3
5 changed files with 19 additions and 1 deletion
|
@ -410,6 +410,8 @@ struct llm_tokenizer_bpe {
|
|||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
||||
case LLAMA_VOCAB_PRE_TYPE_BLOOM:
|
||||
case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
|
||||
regex_exprs = {
|
||||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||
};
|
||||
|
|
|
@ -5467,6 +5467,12 @@ static void llm_load_vocab(
|
|||
} else if (
|
||||
tokenizer_pre == "codeshell") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
||||
} else if (
|
||||
tokenizer_pre == "bloom") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
|
||||
} else if (
|
||||
tokenizer_pre == "gpt3-finnish") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue