llama : use new pre-tokenizer type

commit 43e12ce8e5 (parent 9b4d63ae53)
Georgi Gerganov, 2024-04-26 20:08:28 +03:00

12 changed files with 87 additions and 44 deletions


@@ -398,6 +398,9 @@ class Model(ABC):
         if chkhsh == -3290901550109860290:
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
             res = "llama3"
+        if chkhsh == 5332289095291046364:
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat/blob/main/tokenizer.json
+            res = "deepseek-llm"
         if chkhsh == 4190561703949727616:
             # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
             res = "deepseek-coder"

llama.cpp

@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,

     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -2114,8 +2116,8 @@ struct llama_vocab {
         ttype type;
     };

-    enum llm_arch arch = LLM_ARCH_UNKNOWN;
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -4166,11 +4168,13 @@ static void llm_load_vocab(
     // determine vocab type
     {
-        std::string tokenizer_name;
+        std::string tokenizer_model;
+        std::string tokenizer_pre;

-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);

-        if (tokenizer_name == "no_vocab") {
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;

             // default special tokens
@@ -4184,7 +4188,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;

             return;
-        } else if (tokenizer_name == "llama") {
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;

             // default special tokens
@@ -4229,7 +4233,7 @@ static void llm_load_vocab(
                 if (add_space_prefix_keyidx != -1) {
                     vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
                 } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "bert") {
+        } else if (tokenizer_model == "bert") {
             vocab.type = LLAMA_VOCAB_TYPE_WPM;

             // default special tokens
@@ -4242,10 +4246,10 @@ static void llm_load_vocab(
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
         } else {
-            if (tokenizer_name == "gpt2") {
+            if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
             } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
                 vocab.type = LLAMA_VOCAB_TYPE_SPM;
                 return;
@@ -4285,7 +4289,20 @@ static void llm_load_vocab(
             vocab.special_mask_id = -1;
         }

-        vocab.arch = model.arch;
+        if (tokenizer_pre.empty()) {
+            LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'", __func__);
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "default") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "llama3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+        } else if (tokenizer_pre == "deepseek-llm") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+        } else if (tokenizer_pre == "deepseek-coder") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+        } else {
+            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+        }
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
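With tokenizer.ggml.pre wired in above, a converted GGUF should name its pre-tokenizer explicitly; the loader only falls back to 'default' (with a warning) when the key is absent, and rejects unknown names outright. A hedged sketch of stamping the key from the gguf Python package side — GGUFWriter.add_string is the generic key writer; the output path and arch below are placeholders:

    # Sketch: write the new pre-tokenizer key into a GGUF during conversion.
    # "model.gguf" and the "llama" arch are placeholder values.
    from gguf import GGUFWriter

    writer = GGUFWriter("model.gguf", "llama")
    writer.add_string("tokenizer.ggml.model", "gpt2")    # BPE vocab type
    writer.add_string("tokenizer.ggml.pre",   "llama3")  # new key from this commit
    # ... tokens, merges, scores, and tensors would follow in a full conversion ...
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.close()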
@@ -12011,38 +12028,44 @@ struct llm_tokenizer_bpe {
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
-                switch (vocab.arch) {
-                    // TODO: how to detect deepseek and llama v3 models?
-                    //case LLM_ARCH_LLAMA:
-                    //case LLM_ARCH_DEEPSEEK_CODER:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?\\p{L}+",
-                    //        "\\s?\\p{P}+",
-                    //        "[一-龥ࠀ-一가-퟿]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
-                    //case LLM_ARCH_DEEPSEEK_LLM:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ--ℝℤΩℨK--ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA--z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
-                    //        "\\s?[!-/:-~---‟ -。]+",
-                    //        "\\s+$",
-                    //        "[一-龥ࠀ-一가-퟿]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // TODO: ??????????????
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
+                            "\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]"
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ--ℝℤΩℨK--ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA--z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[!-/:-~---‟ -。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+"
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+"
+                        });
+                        break;
                     default:
                         // default regex for BPE tokenization pre-processing
-                        {
-                            word_collection = unicode_regex_split(text, {
-                                "\\p{P}+",
-                                "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                                "\\p{N}+",
-                                "[0-9][0-9][0-9]"
-                            });
-                        }
+                        word_collection = unicode_regex_split(text, {
+                            "\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]"
+                        });
                         break;
                 }
                 break;
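unicode_regex_split applies the listed patterns in order: each regex carves its matches out of the fragments left by the previous one, and the surviving fragments become the words fed to BPE merging. A rough Python model of that behavior using the third-party regex module, which understands \p{L}/\p{N}/\p{P} — the splitting function below is an illustrative approximation, not a port of the C++ implementation:

    # Rough model of unicode_regex_split: apply each pattern in turn,
    # splitting every fragment into its matches and the gaps between them.
    import regex  # pip install regex

    def regex_split(text: str, patterns: list[str]) -> list[str]:
        fragments = [text]
        for pat in patterns:
            next_frags = []
            for frag in fragments:
                pos = 0
                for m in regex.finditer(pat, frag):
                    if m.start() > pos:
                        next_frags.append(frag[pos:m.start()])  # gap before match
                    next_frags.append(m.group())                # the match itself
                    pos = m.end()
                if pos < len(frag):
                    next_frags.append(frag[pos:])               # trailing gap
            fragments = next_frags
        return fragments

    # e.g. with a subset of the deepseek-coder patterns from the hunk above:
    words = regex_split("def f(x):\n    return x + 1",
                        [r"[\r\n]", r"\s?\p{L}+", r"\s?\p{P}+", r"\p{N}+"])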


@ -69,6 +69,14 @@ extern "C" {
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
}; };
// pre-tokenization types
enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
};
// note: these values should be synchronized with ggml_rope // note: these values should be synchronized with ggml_rope
// TODO: maybe move this enum to ggml.h (ggml_rope_type) // TODO: maybe move this enum to ggml.h (ggml_rope_type)
enum llama_rope_type { enum llama_rope_type {
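The enum itself stays internal to vocab loading — this commit adds no public accessor — but the declared pre-tokenizer can be inspected from the file side. A hedged sketch with gguf-py's GGUFReader; the string-field decoding follows the reader's parts/data layout, which may differ across gguf-py versions:

    # Sketch: inspect which pre-tokenizer a converted GGUF declares.
    # "model.gguf" is a placeholder; field decoding may vary by gguf-py version.
    from gguf import GGUFReader

    reader = GGUFReader("model.gguf")
    field = reader.fields.get("tokenizer.ggml.pre")
    if field is None:
        # matches the loader behavior above: warn and fall back to 'default'
        print("tokenizer.ggml.pre missing -> 'default' pre-tokenizer")
    else:
        print(bytes(field.parts[field.data[0]]).decode("utf-8"))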

Binary file not shown.

Binary file not shown.


@@ -41,13 +41,12 @@ llama_test(test-quantize-perf.cpp)
 llama_test(test-sampling.cpp)
 llama_test(test-chat-template.cpp)

-# TODO: tmp disabled LLaMA v3 and Deepseek tests
 llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-#llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
+llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
 llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-#llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-#llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
 llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)


@@ -27,6 +27,8 @@ tests = [
     " ",
     "\t",
     "\n",
+    "\n\n",
+    "\n\n\n",
     "\t\n",
     "Hello world",
     " Hello world",


@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     { " "            , {    466, }, },
     { "\t"           , {    192, }, },
     { "\n"           , {    193, }, },
+    { "\n\n"         , {   1001, }, },
+    { "\n\n\n"       , {  11331, }, },
     { "\t\n"         , {  19125, }, },
     { "Hello world"  , {   9856,  1079, }, },
     { " Hello world" , {  23090,  1079, }, },


@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     { " "            , {    262, }, },
     { "\t"           , {    197, }, },
     { "\n"           , {    198, }, },
+    { "\n\n"         , {    271, }, },
+    { "\n\n\n"       , {   1432, }, },
     { "\t\n"         , {   1602, }, },
     { "Hello world"  , {   9906,  1917, }, },
     { " Hello world" , {  22691,  1917, }, },


@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     { " "            , {    268, }, },
     { "\t"           , {  29871,    12, }, },
     { "\n"           , {  29871,    13, }, },
+    { "\n\n"         , {  29871,    13,    13, }, },
+    { "\n\n\n"       , {  29871,    13,    13,    13, }, },
     { "\t\n"         , {  29871,    12,    13, }, },
     { "Hello world"  , {  15043,  3186, }, },
     { " Hello world" , {  29871, 15043,  3186, }, },


@@ -27,6 +27,8 @@ tests = [
     " ",
     "\t",
     "\n",
+    "\n\n",
+    "\n\n\n",
     "\t\n",
     "Hello world",
     " Hello world",