llama : add pre-tokenizer regexes for BLOOM and gpt3-finnish (#8850)

This commit is contained in:
Esko Toivonen 2024-08-15 10:17:12 +03:00 committed by GitHub
parent d5492f0525
commit 6bda7ce6c3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 19 additions and 1 deletion

View file

@@ -410,6 +410,8 @@ struct llm_tokenizer_bpe {
};
break;
case LLAMA_VOCAB_PRE_TYPE_PORO:
case LLAMA_VOCAB_PRE_TYPE_BLOOM:
case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
regex_exprs = {
" ?[^(\\s|.,!?…。,、।۔،)]+",
};

View file

@@ -5467,6 +5467,12 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "codeshell") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else if (
tokenizer_pre == "bloom") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
} else if (
tokenizer_pre == "gpt3-finnish") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}