fix falcon preprocess and add deepseek coder

This commit is contained in:
Bingxuan Wang 2023-11-14 17:34:46 +08:00
parent 5600bd8cbc
commit c31263e0cb
8 changed files with 397 additions and 27 deletions

Makefile

@@ -8,7 +8,8 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
     tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
     tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-    tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+    tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \
+    tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -69,6 +70,8 @@ test: $(TEST_TARGETS)
         ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
     elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
         ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
+    elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
+        ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
     elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
         continue; \
     elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -712,6 +715,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

llama.cpp

@@ -76,6 +76,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <iostream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -2266,7 +2267,12 @@ static void llm_load_vocab(
         vocab.special_sep_id = -1;
         vocab.special_pad_id = -1;
     } else if (tokenizer_name == "gpt2" || tokenizer_name == "deepseek_coder") {
-        vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        if(tokenizer_name == "gpt2"){
+            vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        }
+        else if (tokenizer_name == "deepseek_coder"){
+            vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKCODER;
+        }

         // read bpe merges and populate bpe ranks
         const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
@@ -2463,7 +2469,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
-    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : (vocab.type == LLAMA_VOCAB_TYPE_BPE ? "BPE" : "DEEPSEEKCODER")); // TODO: fix
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
     LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
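The nested ternary added above simply maps the three vocab types to the names printed by llm_load_print_meta. The helper below is an illustrative sketch only (it is not part of this commit); it assumes the llama_vocab_type values declared in llama.h further down, and the name llama_vocab_type_name is made up for the example.

#include "llama.h"

// Illustrative sketch, not part of the diff: the enum-to-name mapping that the
// updated LLAMA_LOG_INFO line encodes inline.
static const char * llama_vocab_type_name(enum llama_vocab_type type) {
    switch (type) {
        case LLAMA_VOCAB_TYPE_SPM:           return "SPM";
        case LLAMA_VOCAB_TYPE_BPE:           return "BPE";
        case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: return "DEEPSEEKCODER";
        default:                             return "unknown";
    }
}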
@@ -5342,6 +5348,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         auto buf = token_data.text.substr(3, 2);
         return strtol(buf.c_str(), NULL, 16);
     }
+    case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
     case LLAMA_VOCAB_TYPE_BPE: {
         GGML_ASSERT(false);
         return unicode_to_bytes_bpe(token_data.text);
@@ -5358,6 +5365,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
         const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
         return vocab.token_to_id.at(buf);
     }
+    case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
     case LLAMA_VOCAB_TYPE_BPE: {
         return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
     }
@@ -5554,7 +5562,11 @@ struct llm_tokenizer_bpe {
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;

-        auto word_collection = bpe_gpt2_preprocess(text);
+        std::vector<std::string> word_collection;
+        if(vocab.type == LLAMA_VOCAB_TYPE_BPE)
+            word_collection = bpe_gpt2_preprocess(text);
+        else if(vocab.type == LLAMA_VOCAB_TYPE_DEEPSEEKCODER)
+            word_collection = bpe_deepseek_coder_preprocess(text);

         symbols_final.clear();
@@ -5681,26 +5693,9 @@ private:
         work_queue.push(bigram);
     }

-    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
-        std::vector<std::string> bpe_words;
-        std::vector<std::string> bpe_encoded_words;
-
-        // convert input string to wstring
-        std::wstring input = from_utf8(text);
-        std::wstring regex = from_utf8(gpt2_regex);
-        std::wregex expr(regex);
-        // std::wsmatch m;
-        // // use regex match to get where to split the test string
-        int array[] = {-1,0};
-        std::wsregex_token_iterator iter(input.begin(), input.end(), expr, array);
-        std::wsregex_token_iterator end;
-        for ( ; iter != end; ++iter){
-            if ((*iter).length()>0){
-                bpe_words.push_back(to_utf8(*iter));
-            }
-        }
-
-        // convert each word to utf8
-        for (std::string & word : bpe_words) {
+    std::vector<std::string> byte_encoding_process(const std::vector<std::string> & bpe_words) {
+        std::vector<std::string> bpe_encoded_words;
+        for (auto word : bpe_words) {
             std::string text_utf = "";
             auto utf_word = codepoints_from_utf8(word);
             for (size_t i = 0; i < utf_word.size(); ++i)
@@ -5712,6 +5707,80 @@ private:
             }
             bpe_encoded_words.emplace_back(encoded_token);
         }
+        return bpe_encoded_words;
+    }
+
+    std::vector<std::string> regex_preprocess(const std::vector<std::string> & input, const std::string & regex_expr) {
+        std::regex expr(regex_expr);
+        std::vector<std::string> bpe_words;
+        // std::wsmatch m;
+        // // use regex match to get where to split the test string
+        for (auto & text : input) {
+            std::cregex_iterator it(text.data(), text.data() + text.size(), expr);
+            std::cregex_iterator end;
+
+            // Print the matches
+            unsigned int start_idx = 0;
+            while (it != end) {
+                std::cmatch match = *it;
+                std::string match_str = match.str();
+                if (match.position() > start_idx) {
+                    bpe_words.emplace_back(text.substr(start_idx, match.position() - start_idx));
+                }
+                bpe_words.emplace_back(match_str);
+                start_idx = match.position() + match.length();
+                ++it;
+            }
+            if (start_idx < text.size()) {
+                bpe_words.emplace_back(text.substr(start_idx, text.size() - start_idx));
+            }
+        }
+        return bpe_words;
+    }
+
+    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words = {text};
+        for (auto & regex_expr : gpt2_regex) {
+            bpe_words = regex_preprocess(bpe_words, regex_expr);
+        }
+        std::vector<std::string> bpe_encoded_words = byte_encoding_process(bpe_words);
+        return bpe_encoded_words;
+    }
+
+    std::vector<std::string> bpe_deepseek_coder_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words;
+        std::wstring wtext = from_utf8(text);
+
+        // extract all cjk characters
+        std::wregex expr(L"[\u4e00-\u9fa5\u0800-\u4e00\uac00-\ud7ff]+");
+        std::wcregex_iterator it(wtext.data(), wtext.data() + wtext.size(), expr);
+        std::wcregex_iterator end;
+
+        unsigned int start_idx = 0;
+        while (it != end) {
+            std::wcmatch match = *it;
+            std::wstring match_str = match.str();
+            if (match.position() > start_idx) {
+                bpe_words.emplace_back(to_utf8(wtext.substr(start_idx, match.position() - start_idx)));
+            }
+            bpe_words.emplace_back(to_utf8(match_str));
+            start_idx = match.position() + match.length();
+            ++it;
+        }
+        if (start_idx < wtext.size()) {
+            bpe_words.emplace_back(to_utf8(wtext.substr(start_idx, wtext.size() - start_idx)));
+        }
+
+        for (auto & regex_expr : deepseek_coder_regex) {
+            bpe_words = regex_preprocess(bpe_words, regex_expr);
+        }
+        std::vector<std::string> bpe_encoded_words = byte_encoding_process(bpe_words);
         return bpe_encoded_words;
     }
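The reworked preprocessing above replaces the old single std::wsregex_token_iterator pass with regex_preprocess, which walks each input string and keeps both the regex matches and the unmatched gaps between them, so no input text is silently dropped; bpe_gpt2_preprocess then chains the gpt2_regex patterns, and bpe_deepseek_coder_preprocess first splits out CJK runs before applying deepseek_coder_regex. The standalone program below is an illustrative sketch of that keep-the-gaps splitting step only; the helper name split_keep_gaps and the digit pattern are invented for the example and are not part of the commit.

// Sketch, not part of the diff: split text on a regex, keeping matched spans
// and the unmatched gaps between them as separate "words".
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

static std::vector<std::string> split_keep_gaps(const std::string & text, const std::string & regex_expr) {
    std::regex expr(regex_expr);
    std::vector<std::string> words;
    std::cregex_iterator it(text.data(), text.data() + text.size(), expr);
    std::cregex_iterator end;
    size_t start_idx = 0;
    while (it != end) {
        const std::cmatch match = *it;
        if ((size_t) match.position() > start_idx) {
            words.emplace_back(text.substr(start_idx, match.position() - start_idx)); // gap before the match
        }
        words.emplace_back(match.str());                                              // the match itself
        start_idx = match.position() + match.length();
        ++it;
    }
    if (start_idx < text.size()) {
        words.emplace_back(text.substr(start_idx));                                   // trailing gap
    }
    return words;
}

int main() {
    // split runs of digits into their own words, keeping the surrounding text
    for (const auto & w : split_keep_gaps("abc1314151def", "[0-9]+")) {
        printf("'%s'\n", w.c_str()); // prints 'abc', '1314151', 'def'
    }
    return 0;
}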
@@ -5903,6 +5972,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
             } break;
+        case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
         case LLAMA_VOCAB_TYPE_BPE:
             {
                 for (const auto & fragment: fragment_buffer)
@@ -8972,6 +9042,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
             }
             break;
         }
+        case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
         case LLAMA_VOCAB_TYPE_BPE: {
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;

llama.h

@@ -69,6 +69,7 @@ extern "C" {
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+       LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // deepseek coder
    };

    enum llama_token_type {
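With the new enum value exposed in the public header, callers can branch on it through the existing llama_vocab_type() accessor, exactly as the new tokenizer test below does. A minimal illustrative sketch (not part of the commit) that loads a vocab-only model and reports whether it uses the deepseek coder vocab:

// Sketch only: check a GGUF vocab's type via the public API touched by this commit.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <gguf-vocab-file>\n", argv[0]);
        return 1;
    }
    llama_backend_init(false);
    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true;
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load '%s'\n", argv[1]);
        return 1;
    }
    printf("deepseek coder vocab: %s\n",
           llama_vocab_type(model) == LLAMA_VOCAB_TYPE_DEEPSEEKCODER ? "yes" : "no");
    llama_free_model(model);
    llama_backend_free();
    return 0;
}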

tests/test-tokenizer-0-deepseek_coder.cpp

@@ -0,0 +1,188 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
#include <fstream>
// generate using test-tokenizer-0-falcon.py
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
static std::map<std::string, std::vector<llama_token>> _k_tests = {
{ "" , { }, },
{ " " , { 207, }, },
{ " " , { 243, }, },
{ " " , { 315, }, },
{ "\t" , { 184, }, },
{ "\n" , { 185, }, },
{ "\t\n" , { 184, 185, }, },
{ "Hello world" , { 17535, 1835, }, },
{ " Hello world" , { 414, 9489, 1835, }, },
{ "Hello World" , { 17535, 5414, }, },
{ " Hello World" , { 414, 9489, 5414, }, },
{ " Hello World!" , { 414, 9489, 5414, 0, }, },
{ "Hello, world!" , { 17535, 11, 1835, 0, }, },
{ " Hello, world!" , { 414, 9489, 11, 1835, 0, }, },
{ " this is 🦙.cpp" , { 437, 317, 12394, 99, 234, 13, 14789, }, },
{ "w048 7tuijk dsdfhu" , { 86, 15, 19, 23, 207, 22, 83, 3963, 27659, 26078, 3934, 14072, }, },
{ "нещо на Български" , { 1593, 6478, 616, 2251, 14994, }, },
{ "កាន់តែពិសេសអាចខលចេញ" , { 155, 239, 209, 155, 239, 114, 155, 239, 228, 155, 240, 220, 155, 239, 224, 155, 240, 211, 155, 239, 231, 155, 239, 115, 155, 239, 240, 155, 240, 210, 155, 239, 240, 155, 239, 95, 155, 239, 114, 155, 239, 214, 155, 239, 210, 155, 239, 236, 155, 239, 214, 155, 240, 210, 155, 239, 218, }, },
{ "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 10047, 235, 209, 334, 8760, 8, 12394, 233, 114, 350, 222, 10047, 221, 104, 169, 116, 224, 334, 4684, 3909, 992, 24330, 262, 29651, 612, 8, 207, 156, 237, 214, 334, 5950, 992, 78, 12896, 344, 638, 891, 1372, 10736, 8, }, },
{ "Hello" , { 17535, }, },
{ " Hello" , { 414, 9489, }, },
{ " Hello" , { 207, 414, 9489, }, },
{ " Hello" , { 243, 414, 9489, }, },
{ " Hello" , { 315, 414, 9489, }, },
{ " Hello\n Hello" , { 315, 414, 9489, 185, 315, 414, 9489, }, },
{ "\n =" , { 185, 405, }, },
{ "' era" , { 6, 2895, }, },
{ "Hello, y'all! How are you 😁 ?我想在apple工作1314151天", { 17535, 11, 320, 6, 435, 0, 1717, 417, 340, 12394, 233, 210, 3015, 19100, 608, 9413, 2668, 16, 18, 16, 19, 16, 20, 16, 1393, 169, 121, 239, }, },
};
return _k_tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
std::string fname_text;
if (argc > 2) {
fname_text = argv[2];
}
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_DEEPSEEKCODER) {
fprintf(stderr, "%s : error: vocab type is not DEEPSEEKCODER\n", __func__);
llama_free_model(model);
llama_free(ctx);
return 2;
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
for (const auto & test_kv : k_tests()) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
}
printf("\n");
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (test_kv.second[i] != res[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize_bpe(ctx, res).c_str(),
llama_detokenize_bpe(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
success = false;
}
}
if (!fname_text.empty()) {
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
std::string text;
{
std::ifstream ifs(fname_text);
if (!ifs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
return 1;
}
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
}
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
{
const std::string fname_out = fname_text + ".tokcpp";
std::ofstream ofs(fname_out);
if (!ofs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return 1;
}
for (const auto & tok : res) {
ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
}
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return success ? 0 : 3;
}


@@ -0,0 +1,85 @@
# tests with BPE tokenizer
import os
import sys
import argparse
from transformers import AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
]
for text in tests:
    print('text: ', text)
    print(tokenizer.encode(text))
    print(tokenizer.decode(tokenizer.encode(text)))
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text)
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s)
        # write to file
        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
    print('results written to: ', fname_out)

tests/test-tokenizer-0-falcon.cpp

@@ -38,6 +38,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
         { "\n =" , { 1212, 40, }, },
         { "' era" , { 18, 4932, }, },
+        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天", { 9856, 23, 291, 18, 436, 12, 1265, 362, 299, 8196, 207, 204, 42, 50087, 123, 2727, 20300, 32022, 133, 234, 17419, 30137, 28, 7858, 181, 133, 236, }, },
     };

     return _k_tests;
@@ -115,7 +116,6 @@ int main(int argc, char **argv) {
         printf("\n");

         bool correct = res.size() == test_kv.second.size();
-
         for (int i = 0; i < (int) res.size() && correct; ++i) {
             if (test_kv.second[i] != res[i]) {
                 correct = false;

tests/test-tokenizer-0-falcon.py

@@ -43,6 +43,7 @@ tests = [
         " Hello\n Hello",
         "\n =",
         "' era",
+        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
     ]

 for text in tests:

File diff suppressed because one or more lines are too long