Add Viking-7B tokenizer support

Aarni Koskela 2024-05-17 14:29:22 +03:00
parent c28c99629c
commit 2c8f62fd40
4 changed files with 14 additions and 3 deletions


@@ -82,6 +82,7 @@ models = [
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
     {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", },  # Same for 13B and 33B
 ]
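
For context (not part of this commit), convert-hf-to-gguf-update.py walks this models list, downloads each tokenizer, encodes a long fixed probe string (chktxt), and records the SHA-256 of the resulting token IDs as the chkhsh checked in the next hunk. A minimal, hedged sketch of the same idea, assuming the transformers package and access to LumiOpen/Viking-7B (the digest only matches the recorded one when the script's own probe text is used):

from hashlib import sha256
from transformers import AutoTokenizer

# Hash Viking-7B's tokenization of a probe string, mirroring how the update
# script derives chkhsh; "probe" here is illustrative, not the real chktxt.
tokenizer = AutoTokenizer.from_pretrained("LumiOpen/Viking-7B")
probe = "Hello world! Hei maailma! 123"
chkhsh = sha256(str(tokenizer.encode(probe)).encode()).hexdigest()
print(chkhsh)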


@@ -476,6 +476,9 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
         if res is None:
             logger.warning("\n")
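
The "viking" name returned here is stored in the converted model's GGUF metadata under the tokenizer.ggml.pre key, which is the string llama.cpp reads back as tokenizer_pre in the hunk below. A hedged sketch of just that metadata step, assuming the gguf Python package's GGUFWriter and its add_tokenizer_pre() helper (a real conversion also writes the architecture, tensors, token list and merges):

import gguf

# "viking-7b-f16.gguf" is a hypothetical output path used only for illustration.
writer = gguf.GGUFWriter("viking-7b-f16.gguf", arch="llama")
writer.add_tokenizer_pre("viking")  # written as the tokenizer.ggml.pre key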


@@ -4549,9 +4549,10 @@ static void llm_load_vocab(
                 tokenizer_pre == "default") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
-                tokenizer_pre == "llama3"   ||
-                tokenizer_pre == "llama-v3" ||
-                tokenizer_pre == "llama-bpe") {
+                tokenizer_pre == "llama3"    ||
+                tokenizer_pre == "llama-v3"  ||
+                tokenizer_pre == "llama-bpe" ||
+                tokenizer_pre == "viking-7b") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             } else if (
                 tokenizer_pre == "deepseek-llm") {
@@ -12580,6 +12581,11 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                word_collection = unicode_regex_split(text, {
+                    " ?[^(\\s|[.,!?…。,、।۔،])]+",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
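
The new VIKING case supplies the pre-tokenization split applied before BPE merging. As a hedged illustration of its apparent intent (break the text into chunks, each optionally starting with one space and ending before whitespace or the listed Latin, CJK, Devanagari and Arabic punctuation), here is a Python approximation; it deliberately rewrites the character class rather than transcribing the C++ literal, which mixes group and class syntax:

import re

# Approximation of the intended split: an optional leading space followed by a run of
# characters that are neither whitespace nor one of the punctuation marks listed above.
VIKING_SPLIT_APPROX = re.compile(r" ?[^\s.,!?…。、।۔،()|\[\]]+")

print(VIKING_SPLIT_APPROX.findall("Hei maailma! Mitä sinulle kuuluu?"))
# ['Hei', ' maailma', ' Mitä', ' sinulle', ' kuuluu']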


@@ -86,6 +86,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_OLMO   = 12,
         LLAMA_VOCAB_PRE_TYPE_DBRX   = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG  = 14,
+        LLAMA_VOCAB_PRE_TYPE_VIKING = 15,
     };
 
     // note: these values should be synchronized with ggml_rope
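
The header only needs the next free enum value; the name, not the number, is what travels through the GGUF metadata. For reference, a hedged Python mirror of the visible tail of the enum (the C header remains the source of truth):

from enum import IntEnum

# Mirror of the llama_vocab_pre_type values shown in this hunk, for illustration only.
class LlamaVocabPreType(IntEnum):
    OLMO   = 12
    DBRX   = 13
    SMAUG  = 14
    VIKING = 15  # added by this commit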