llama : use new pre-tokenizer type
commit 43e12ce8e5 (parent 9b4d63ae53)

12 changed files with 87 additions and 44 deletions
@@ -398,6 +398,9 @@ class Model(ABC):
         if chkhsh == -3290901550109860290:
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
             res = "llama3"
+        if chkhsh == 5332289095291046364:
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat/blob/main/tokenizer.json
+            res = "deepseek-llm"
         if chkhsh == 4190561703949727616:
             # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
             res = "deepseek-coder"
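
Note: the converter picks res by fingerprinting the model's tokenizer. Below is a minimal sketch of the idea, assuming chkhsh is a hash of the token ids the tokenizer produces for a fixed probe string; the probe text, helper name, and hashing scheme are illustrative assumptions, not the converter's actual code.

from transformers import AutoTokenizer

def guess_pre_tokenizer(model_dir: str) -> str:
    # Illustrative probe; the real converter uses its own checksum text.
    probe = "Hello, world! \n\n 123 can't \t"
    tok = AutoTokenizer.from_pretrained(model_dir)
    chkhsh = hash(tuple(tok.encode(probe)))  # assumed fingerprint scheme

    if chkhsh == -3290901550109860290:
        return "llama3"          # Meta-Llama-3-8B-Instruct
    if chkhsh == 5332289095291046364:
        return "deepseek-llm"    # deepseek-llm-7b-chat
    if chkhsh == 4190561703949727616:
        return "deepseek-coder"  # deepseek-coder-6.7b-instruct
    raise NotImplementedError("unknown pre-tokenizer; add its hash to the table")
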

llama.cpp (103 changed lines)

@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,

     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK,         "%s.ssm.time_step_rank" },

     { LLM_KV_TOKENIZER_MODEL,            "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE,              "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST,             "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE,       "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
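
Note: on the conversion side, the new key can be written with gguf-py. A sketch, assuming the generic GGUFWriter.add_string() is used for the new KV; the output path and architecture are placeholders.

import gguf

writer = gguf.GGUFWriter("model.gguf", arch="llama")   # placeholder path/arch
writer.add_tokenizer_model("gpt2")                     # tokenizer.ggml.model
writer.add_string("tokenizer.ggml.pre", "llama3")      # the new tokenizer.ggml.pre KV
# ... tokens, merges and tensors would be added here, then the usual write-to-file calls ...
writer.close()
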
@@ -2114,8 +2116,8 @@ struct llama_vocab {
         ttype type;
     };

-    enum llm_arch arch = LLM_ARCH_UNKNOWN;
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -4166,11 +4168,13 @@ static void llm_load_vocab(

     // determine vocab type
     {
-        std::string tokenizer_name;
+        std::string tokenizer_model;
+        std::string tokenizer_pre;

-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);

-        if (tokenizer_name == "no_vocab") {
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;

             // default special tokens
@@ -4184,7 +4188,7 @@ static void llm_load_vocab(
             vocab.linefeed_id     = -1;

             return;
-        } else if (tokenizer_name == "llama") {
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;

             // default special tokens
@@ -4229,7 +4233,7 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "bert") {
+        } else if (tokenizer_model == "bert") {
             vocab.type = LLAMA_VOCAB_TYPE_WPM;

             // default special tokens
@@ -4242,10 +4246,10 @@ static void llm_load_vocab(
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
         } else {
-            if (tokenizer_name == "gpt2") {
+            if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
             } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
                 vocab.type = LLAMA_VOCAB_TYPE_SPM;
                 return;
@@ -4285,7 +4289,20 @@ static void llm_load_vocab(
             vocab.special_mask_id = -1;
         }

-        vocab.arch = model.arch;
+        if (tokenizer_pre.empty()) {
+            LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'", __func__);
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "default") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "llama3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+        } else if (tokenizer_pre == "deepseek-llm") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+        } else if (tokenizer_pre == "deepseek-coder") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+        } else {
+            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+        }
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
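
Note: GGUF files converted before this change carry no tokenizer.ggml.pre key and will hit the warning branch above. One way to check what a given file declares, sketched with gguf-py's reader; the path is a placeholder and the field-decoding details may differ between gguf-py versions.

from gguf import GGUFReader

reader = GGUFReader("model.gguf")                   # placeholder path
field = reader.get_field("tokenizer.ggml.pre")
if field is None:
    print("tokenizer.ggml.pre missing -> llama.cpp falls back to 'default'")
else:
    # assumed layout: the indexed part holds the raw UTF-8 string value
    print(bytes(field.parts[field.data[0]]).decode("utf-8"))
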
@@ -12011,38 +12028,44 @@ struct llm_tokenizer_bpe {
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
-                switch (vocab.arch) {
-                    // TODO: how to detect deepseek and llama v3 models?
-                    //case LLM_ARCH_LLAMA:
-                    //case LLM_ARCH_DEEPSEEK_CODER:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?\\p{L}+",
-                    //        "\\s?\\p{P}+",
-                    //        "[一-龥ࠀ-一가-]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
-                    //case LLM_ARCH_DEEPSEEK_LLM:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
-                    //        "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
-                    //        "\\s+$",
-                    //        "[一-龥ࠀ-一가-]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // TODO: ??????????????
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                            "\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]"
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-]+",
+                            "\\p{N}+"
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-]+",
+                            "\\p{N}+"
+                        });
+                        break;
                     default:
                         // default regex for BPE tokenization pre-processing
-                        {
-                            word_collection = unicode_regex_split(text, {
-                                "\\p{P}+",
-                                "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                                "\\p{N}+",
-                                "[0-9][0-9][0-9]"
-                            });
-                        }
+                        word_collection = unicode_regex_split(text, {
+                            "\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]"
+                        });
                         break;
                 }
                 break;
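
Note: unicode_regex_split() applies each pattern in turn to carve the input into chunks before the BPE merges run, so chunk boundaries (and therefore the final token ids) depend on which pre-tokenizer regex set is selected. A rough Python illustration of that splitting step, using the third-party regex module and the classic GPT-2 pattern; it only mimics the idea and is not the llama.cpp implementation.

import regex  # pip install regex; supports \p{L}, \p{N}, ...

GPT2_SPLIT = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"

def pre_tokenize(text: str) -> list[str]:
    # Each match becomes one "word"; BPE merges then operate inside each word.
    return regex.findall(GPT2_SPLIT, text)

print(pre_tokenize("Hello world\n\nHow are you?"))
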

llama.h (8 changed lines)

@@ -69,6 +69,14 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };

+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+    };
+
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {

3 binary files not shown

@@ -41,13 +41,12 @@ llama_test(test-quantize-perf.cpp)
 llama_test(test-sampling.cpp)
 llama_test(test-chat-template.cpp)

-# TODO: tmp disabled LLaMA v3 and Deepseek tests
 llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-#llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
+llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
 llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)

-#llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-#llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)

 llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
@@ -27,6 +27,8 @@ tests = [
     " ",
     "\t",
     "\n",
+    "\n\n",
+    "\n\n\n",
     "\t\n",
     "Hello world",
     " Hello world",
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "            , {    466, }, },
         { "\t"           , {    192, }, },
         { "\n"           , {    193, }, },
+        { "\n\n"         , {   1001, }, },
+        { "\n\n\n"       , {  11331, }, },
         { "\t\n"         , {  19125, }, },
         { "Hello world"  , {   9856,  1079, }, },
         { " Hello world" , {  23090,  1079, }, },
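
Note: expected id lists like the ones in these k_tests() tables can be regenerated with the reference Hugging Face tokenizer for the vocab under test. A hedged sketch; the model id is a placeholder, pick the one that matches the vocab file being tested.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct")  # placeholder model id
for text in [" ", "\t", "\n", "\n\n", "\n\n\n", "\t\n", "Hello world", " Hello world"]:
    ids = tok.encode(text, add_special_tokens=False)
    print(repr(text), ids)  # copy the ids into the corresponding test table
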
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "            , {    262, }, },
         { "\t"           , {    197, }, },
         { "\n"           , {    198, }, },
+        { "\n\n"         , {    271, }, },
+        { "\n\n\n"       , {   1432, }, },
         { "\t\n"         , {   1602, }, },
         { "Hello world"  , {   9906,  1917, }, },
         { " Hello world" , {  22691,  1917, }, },
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "            , {   268, }, },
         { "\t"           , { 29871,    12, }, },
         { "\n"           , { 29871,    13, }, },
+        { "\n\n"         , { 29871,    13,    13, }, },
+        { "\n\n\n"       , { 29871,    13,    13,    13, }, },
         { "\t\n"         , { 29871,    12,    13, }, },
         { "Hello world"  , { 15043,  3186, }, },
         { " Hello world" , { 29871, 15043,  3186, }, },
@@ -27,6 +27,8 @@ tests = [
     " ",
     "\t",
     "\n",
+    "\n\n",
+    "\n\n\n",
     "\t\n",
     "Hello world",
     " Hello world",