Add Viking-7B tokenizer support

This commit is contained in:
Aarni Koskela 2024-05-17 14:29:22 +03:00
parent c28c99629c
commit 2c8f62fd40
4 changed files with 14 additions and 3 deletions

View file

@ -82,6 +82,7 @@ models = [
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Same for 13B and 33B
]

View file

@ -476,6 +476,9 @@ class Model:
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
if res is None:
logger.warning("\n")

View file

@ -4549,9 +4549,10 @@ static void llm_load_vocab(
tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe") {
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe" ||
tokenizer_pre == "viking-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
} else if (
tokenizer_pre == "deepseek-llm") {
@ -12580,6 +12581,11 @@ struct llm_tokenizer_bpe {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
});
break;
case LLAMA_VOCAB_PRE_TYPE_VIKING:
word_collection = unicode_regex_split(text, {
" ?[^(\\s|[.,!?…。,、।۔،])]+",
});
break;
default:
// default regex for BPE tokenization pre-processing
word_collection = unicode_regex_split(text, {

View file

@ -86,6 +86,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_VIKING = 15,
};
// note: these values should be synchronized with ggml_rope