From 403758f93ecad225b431fafb2d938101b25e8c59 Mon Sep 17 00:00:00 2001 From: zhenweijin Date: Wed, 11 Sep 2024 09:42:55 +0800 Subject: [PATCH] refactor tokenizer --- src/llama-vocab.cpp | 246 +++++++++++++++++++++++-------------- src/llama-vocab.h | 25 ++-- src/llama.cpp | 15 ++- tests/test-tokenizer-0.cpp | 88 ++++++++----- 4 files changed, 237 insertions(+), 137 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 2c007477e..a73038af6 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -50,7 +50,7 @@ struct naive_trie { res.first->second.insert(key + 1, len - 1, value); } } - std::pair get_longest_prefix(const char * key, size_t len, size_t offset = 0) { + std::pair get_longest_prefix(const char * key, size_t len, size_t offset = 0) const { if (len == 0 || offset == len) { return std::make_pair(key, offset); } @@ -187,10 +187,17 @@ struct llm_bigram_spm { size_t size; }; -struct llm_tokenizer_spm { - llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {} +struct llm_tokenizer_spm : llm_tokenizer { + llm_tokenizer_spm(const llama_vocab & vocab) : llm_tokenizer(vocab) {} +}; + +struct llm_tokenizer_spm_session { + + llm_tokenizer_spm_session(const llm_tokenizer & tokenizer) : + spm_tokenizer(static_cast(tokenizer)) {} void tokenize(const std::string & text, std::vector & output) { + // split string into utf8 chars int index = 0; size_t offs = 0; @@ -250,6 +257,7 @@ struct llm_tokenizer_spm { private: void resegment(llm_symbol & symbol, std::vector & output) { + const auto & vocab = spm_tokenizer.vocab; auto text = std::string(symbol.text, symbol.n); auto token = vocab.token_to_id.find(text); @@ -271,7 +279,7 @@ private: return; } - resegment(symbols[p->second.first], output); + resegment(symbols[p->second.first], output); resegment(symbols[p->second.second], output); } @@ -279,7 +287,7 @@ private: if (left == -1 || right == -1) { return; } - + const auto & vocab = spm_tokenizer.vocab; const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n); auto token = vocab.token_to_id.find(text); @@ -305,11 +313,10 @@ private: rev_merge[text] = std::make_pair(left, right); } - const llama_vocab & vocab; + const llm_tokenizer_spm & spm_tokenizer; std::vector symbols; llm_bigram_spm::queue work_queue; - std::map> rev_merge; }; @@ -352,8 +359,8 @@ struct llm_bigram_bpe { size_t size; }; -struct llm_tokenizer_bpe { - llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) { +struct llm_tokenizer_bpe : llm_tokenizer { + llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer(vocab) { GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE); switch (vocab.type_pre) { case LLAMA_VOCAB_PRE_TYPE_LLAMA3: @@ -462,11 +469,20 @@ struct llm_tokenizer_bpe { } } + std::vector regex_exprs; +}; + +struct llm_tokenizer_bpe_session { + + llm_tokenizer_bpe_session(const llm_tokenizer & tokenizer) : + bpe_tokenizer(static_cast(tokenizer)) {} + void append(const llama_vocab::id token_id, std::vector & output) const { output.push_back(token_id); } bool append_bos(std::vector & output) const { + const auto & vocab = bpe_tokenizer.vocab; if (vocab.tokenizer_add_bos) { GGML_ASSERT(vocab.special_bos_id != -1); output.push_back(vocab.special_bos_id); @@ -476,6 +492,7 @@ struct llm_tokenizer_bpe { } bool append_eos(std::vector & output) const { + const auto & vocab = bpe_tokenizer.vocab; if (vocab.tokenizer_add_eos) { GGML_ASSERT(vocab.special_eos_id != -1); output.push_back(vocab.special_eos_id); @@ -485,6 +502,7 @@ struct llm_tokenizer_bpe { } void check_double_bos_eos(const std::vector & output) const { + const auto & vocab = bpe_tokenizer.vocab; if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { LLAMA_LOG_WARN( "%s: Added a BOS token to the prompt as specified by the model but the prompt " @@ -501,8 +519,8 @@ struct llm_tokenizer_bpe { void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; - - const auto word_collection = unicode_regex_split(text, regex_exprs); + const auto word_collection = unicode_regex_split(text, bpe_tokenizer.regex_exprs); + const auto & vocab = bpe_tokenizer.vocab; symbols_final.clear(); @@ -609,7 +627,7 @@ private: if (left == -1 || right == -1) { return; } - + const auto & vocab = bpe_tokenizer.vocab; std::string left_token = std::string(symbols[left].text, symbols[left].n); std::string right_token = std::string(symbols[right].text, symbols[right].n); @@ -632,13 +650,10 @@ private: work_queue.push(bigram); } - const llama_vocab & vocab; - - std::vector regex_exprs; + const llm_tokenizer_bpe & bpe_tokenizer; std::vector symbols; std::vector symbols_final; - llm_bigram_bpe::queue work_queue; }; @@ -646,15 +661,20 @@ private: // WPM tokenizer // -struct llm_tokenizer_wpm { - llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {} +struct llm_tokenizer_wpm : llm_tokenizer { + llm_tokenizer_wpm(const llama_vocab & vocab) : llm_tokenizer(vocab) {} +}; - void tokenize(const std::string & text, std::vector & output) const { +struct llm_tokenizer_wpm_session { + + llm_tokenizer_wpm_session(const llm_tokenizer & tokenizer) + : wpm_tokenizer(static_cast(tokenizer)) {} + + void tokenize(const std::string & text, std::vector & output) { + const auto & vocab = wpm_tokenizer.vocab; const auto & token_map = vocab.token_to_id; - // normalize and split by whitespace std::vector words = preprocess(text); - // bos token prepended already // find the longest tokens that form the words @@ -751,15 +771,16 @@ struct llm_tokenizer_wpm { //(cpt >= 0xFF00 && cpt <= 0xFFEF); } - const llama_vocab & vocab; +private: + const llm_tokenizer_wpm & wpm_tokenizer; }; // // UGM tokenizer // -struct llm_tokenizer_ugm { - llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) { +struct llm_tokenizer_ugm : llm_tokenizer { + llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer(vocab) { if (vocab.precompiled_charsmap.size() > 0) { size_t charsmap_offset = 0; @@ -805,6 +826,31 @@ struct llm_tokenizer_ugm { unknown_token_score = min_score - unknown_token_score_penalty; } + // escaped space symbol - U+2581 (Lower One Eighth Block) + const std::string escaped_space = "\xE2\x96\x81"; + + const char * prefix_replacements = NULL; + size_t prefix_replacements_size = 0; + + const uint32_t * xcda_array = NULL; + size_t xcda_array_size = 0; + + struct naive_trie user_defined_token_matcher; + + float min_score = FLT_MAX; + float max_score = -FLT_MAX; + + float unknown_token_score_penalty = 10.0; + float unknown_token_score; + + struct naive_trie token_matcher; +}; + +struct llm_tokenizer_ugm_session { + + llm_tokenizer_ugm_session(const llm_tokenizer & tokenizer) + : ugm_tokenizer(static_cast(tokenizer)) {} + /* This implementation is based on SentencePiece optimized Viterbi algorithm for * unigram language models. The general idea is to: * - move along the input sequence in steps of one UTF code point, @@ -821,6 +867,7 @@ struct llm_tokenizer_ugm { void tokenize(const std::string & text, std::vector & output) { // get current size of output (for reversal later) size_t output_size = output.size(); + const auto & vocab = ugm_tokenizer.vocab; // normalize the input first std::string normalized; @@ -843,7 +890,7 @@ struct llm_tokenizer_ugm { // traverse the token matcher trie to find a matching token bool single_codepoint_token_found = false; const struct best_tokenization & current_best = tokenization_results[input_offset]; - const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]); + const struct naive_trie * node = ugm_tokenizer.token_matcher.traverse(normalized[prefix_offset++]); while (prefix_offset <= input_len && node != NULL) { // check if we found valid token in prefix @@ -873,7 +920,7 @@ struct llm_tokenizer_ugm { // if we didn't find a valid token corresponding to the whole UTF code point // then use unknown token as the tokenization of this UTF code point if (!single_codepoint_token_found) { - const double challenger_score = current_best.score_sum + unknown_token_score; + const double challenger_score = current_best.score_sum + ugm_tokenizer.unknown_token_score; prefix_offset = input_offset + n_utf8_code_units; struct best_tokenization & current_champ = tokenization_results[prefix_offset]; if (challenger_score > current_champ.score_sum) { @@ -905,7 +952,6 @@ struct llm_tokenizer_ugm { } private: - const llama_vocab & vocab; // helper structure for returning normalization results struct normalization_result { @@ -917,8 +963,9 @@ private: void normalize(const std::string& input, std::string * normalized) { normalized->clear(); normalized->reserve(input.size() * 3); + const auto & vocab = ugm_tokenizer.vocab; - const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " "; + const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer.escaped_space : " "; bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix; @@ -1000,13 +1047,21 @@ private: size_t xcda_array_size; }; + // this structure stores the best tokenization so far at input_offset + struct best_tokenization { + llama_token token_id; + size_t input_offset; + float score_sum; + }; + struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) { if (input_offset == input.size()) { return { &input[input_offset], 0, 0 }; } // if input prefix matches some user-defined token return this token as normalization result - auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset); + auto user_defined_token_match = + ugm_tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset); if (user_defined_token_match.second > 0) { return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second }; } @@ -1014,8 +1069,8 @@ private: size_t longest_prefix_length = 0; size_t longest_prefix_offset = 0; - if (xcda_array_size > 0) { - struct xcda_array_view xcda_view(xcda_array, xcda_array_size); + if (ugm_tokenizer.xcda_array_size > 0) { + struct xcda_array_view xcda_view(ugm_tokenizer.xcda_array, ugm_tokenizer.xcda_array_size); // Find the longest normalized sequence matching the input prefix by walking // the XOR-compressed compact double array (XCDA) starting from the root node @@ -1051,10 +1106,10 @@ private: if (longest_prefix_length > 0) { // we have a match, so return the replacement sequence - if (longest_prefix_offset >= prefix_replacements_size) { + if (longest_prefix_offset >= ugm_tokenizer.prefix_replacements_size) { throw std::runtime_error("Index out of array bounds in precompiled charsmap!"); } - const char * prefix_replacement = &prefix_replacements[longest_prefix_offset]; + const char * prefix_replacement = &(ugm_tokenizer.prefix_replacements)[longest_prefix_offset]; return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length }; } else { // check if the input prefix contains a valid sequence of UTF-8 code units @@ -1070,31 +1125,7 @@ private: } } - // escaped space symbol - U+2581 (Lower One Eighth Block) - const std::string escaped_space = "\xE2\x96\x81"; - - const char * prefix_replacements = NULL; - size_t prefix_replacements_size = 0; - - const uint32_t * xcda_array = NULL; - size_t xcda_array_size = 0; - - struct naive_trie user_defined_token_matcher; - - // this structure stores the best tokenization so far at input_offset - struct best_tokenization { - llama_token token_id; - size_t input_offset; - float score_sum; - }; - - float min_score = FLT_MAX; - float max_score = -FLT_MAX; - - float unknown_token_score_penalty = 10.0; - float unknown_token_score; - - struct naive_trie token_matcher; + const llm_tokenizer_ugm & ugm_tokenizer; }; // @@ -1155,8 +1186,8 @@ static std::vector llama_unescape_rwkv_token(const std::string & escape return output; } -struct llm_tokenizer_rwkv { - llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) { +struct llm_tokenizer_rwkv : llm_tokenizer { + llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer(vocab) { // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens. // For now, we decode the vocab here into the lookup we'll use for tokenization. @@ -1168,11 +1199,19 @@ struct llm_tokenizer_rwkv { } } + struct naive_trie token_matcher; +}; + +struct llm_tokenizer_rwkv_session { + + llm_tokenizer_rwkv_session(const llm_tokenizer & tokenizer) + : rwkv_tokenizer(static_cast(tokenizer)) {} + void tokenize(const std::string & text, std::vector & output) { uint32_t position = 0; - + const auto & vocab = rwkv_tokenizer.vocab; while (position < text.size()) { - const struct naive_trie * node = token_matcher.traverse(text[position]); + const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]); if (node == NULL) { // no matching token found, add unknown token output.push_back(vocab.special_unk_id); @@ -1197,9 +1236,8 @@ struct llm_tokenizer_rwkv { } } - const llama_vocab & vocab; - - struct naive_trie token_matcher; +private: + const llm_tokenizer_rwkv & rwkv_tokenizer; }; // @@ -1362,9 +1400,11 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< } } -std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) { +std::vector llama_tokenize_internal(const llm_tokenizer * tokenizer, + std::string raw_text, bool add_special, bool parse_special) { std::vector output; std::forward_list fragment_buffer; + const llama_vocab & vocab = tokenizer->vocab; if (!raw_text.empty()) { fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); @@ -1399,9 +1439,9 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); - tokenizer.tokenize(raw_text, output); + llm_tokenizer_spm_session session(*tokenizer); + session.tokenize(raw_text, output); is_prev_special = false; } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); @@ -1423,10 +1463,11 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe tokenizer(vocab); - + llm_tokenizer_bpe_session session(*tokenizer); + // it calls some other methods that are not exist in llm_tokenizer, + // here just cast it to bpe tokenizer object if (add_special) { - tokenizer.append_bos(output); + session.append_bos(output); } for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -1435,15 +1476,15 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - tokenizer.append(fragment.token, output); + session.append(fragment.token, output); } } if (add_special) { - tokenizer.append_eos(output); - tokenizer.check_double_bos_eos(output); + session.append_eos(output); + session.check_double_bos_eos(output); } } break; case LLAMA_VOCAB_TYPE_WPM: @@ -1453,7 +1494,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, output.push_back(vocab.special_cls_id); } - llm_tokenizer_wpm tokenizer(vocab); + llm_tokenizer_wpm_session session(*tokenizer); for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -1462,7 +1503,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } @@ -1475,12 +1516,11 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, } break; case LLAMA_VOCAB_TYPE_UGM: { - llm_tokenizer_ugm tokenizer(vocab); - if (add_special && vocab.tokenizer_add_bos != 0) { GGML_ASSERT(vocab.special_bos_id != -1); output.push_back(vocab.special_bos_id); } + llm_tokenizer_ugm_session session(*tokenizer); for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -1488,7 +1528,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } @@ -1508,6 +1548,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, } break; case LLAMA_VOCAB_TYPE_RWKV: { + llm_tokenizer_rwkv_session session(*tokenizer); for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -1516,8 +1557,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif - llm_tokenizer_rwkv tokenizer(vocab); - tokenizer.tokenize(raw_text, output); + session.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); } @@ -1530,6 +1570,32 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, return output; } +llm_tokenizer * llama_create_tokenizer(const llama_vocab & vocab) { + llm_tokenizer * tokenizer; + + switch (vocab.type) { + case LLAMA_VOCAB_TYPE_SPM: + tokenizer = new llm_tokenizer_spm(vocab); + break; + case LLAMA_VOCAB_TYPE_BPE: + tokenizer = new llm_tokenizer_bpe(vocab); + break; + case LLAMA_VOCAB_TYPE_WPM: + tokenizer = new llm_tokenizer_wpm(vocab); + break; + case LLAMA_VOCAB_TYPE_UGM: + tokenizer = new llm_tokenizer_ugm(vocab); + break; + case LLAMA_VOCAB_TYPE_RWKV: + tokenizer = new llm_tokenizer_rwkv(vocab); + break; + default: + GGML_ABORT("fatal error"); + } + + return tokenizer; +} + llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) { GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); static const char * hex = "0123456789ABCDEF"; @@ -1634,14 +1700,14 @@ llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { } int32_t llama_tokenize_impl( - const struct llama_vocab & vocab, - const char * text, - int32_t text_len, - llama_token * tokens, - int32_t n_tokens_max, - bool add_special, - bool parse_special) { - auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special); + const llm_tokenizer * tokenizer, + const char * text, + int32_t text_len, + llama_token * tokens, + int32_t n_tokens_max, + bool add_special, + bool parse_special) { + auto res = llama_tokenize_internal(tokenizer, std::string(text, text_len), add_special, parse_special); if (n_tokens_max < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); return -((int) res.size()); diff --git a/src/llama-vocab.h b/src/llama-vocab.h index dc4b5f12f..0025a8ab7 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -64,6 +64,13 @@ struct llama_vocab { int find_bpe_rank(const std::string & token_left, const std::string & token_right) const; }; +struct llm_tokenizer { + llm_tokenizer(const llama_vocab & vocab) : vocab(vocab) {} + virtual ~llm_tokenizer() = default; + + const llama_vocab & vocab; +}; + // // internal API // @@ -71,11 +78,13 @@ struct llama_vocab { // TODO: rename to llama_tokenize_impl // TODO: This should probably be in llama.h std::vector llama_tokenize_internal( - const llama_vocab & vocab, + const llm_tokenizer * tokenizer, std::string raw_text, bool add_special, bool parse_special = false); +llm_tokenizer * llama_create_tokenizer(const llama_vocab & vocab); + // TODO: move the API below as member functions of llama_vocab llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch); @@ -106,13 +115,13 @@ llama_token llama_token_eot_impl (const struct llama_vocab & vocab); llama_token llama_token_eom_impl (const struct llama_vocab & vocab); int32_t llama_tokenize_impl( - const struct llama_vocab & vocab, - const char * text, - int32_t text_len, - llama_token * tokens, - int32_t n_tokens_max, - bool add_special, - bool parse_special); + const llm_tokenizer * tokenizer, + const char * text, + int32_t text_len, + llama_token * tokens, + int32_t n_tokens_max, + bool add_special, + bool parse_special); // does not write null-terminator to buf int32_t llama_token_to_piece_impl( diff --git a/src/llama.cpp b/src/llama.cpp index af8afd845..6ecbe3d26 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2846,8 +2846,9 @@ struct llama_model { std::string name = "n/a"; - llama_hparams hparams = {}; - llama_vocab vocab; + llama_hparams hparams = {}; + llama_vocab vocab; + llm_tokenizer * tokenizer; struct ggml_tensor * tok_embd; struct ggml_tensor * type_embd; @@ -2923,6 +2924,8 @@ struct llama_model { while (!lora_adapters.empty()) { llama_lora_adapter_free(*lora_adapters.begin()); } + + delete tokenizer; } }; @@ -6404,6 +6407,8 @@ static void llm_load_vocab( } GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size()); + model.tokenizer = llama_create_tokenizer(vocab); + // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { // For Fill-In-the-Middle (FIM)/infill models which where converted @@ -6453,11 +6458,11 @@ static void llm_load_vocab( } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { vocab.linefeed_id = vocab.special_pad_id; } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) { - const std::vector ids = llama_tokenize_internal(vocab, "\n", false); + const std::vector ids = llama_tokenize_internal(model.tokenizer, "\n", false); GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); vocab.linefeed_id = ids[0]; } else { - const std::vector ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A + const std::vector ids = llama_tokenize_internal(model.tokenizer, "\xC4\x8A", false); // U+010A GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); vocab.linefeed_id = ids[0]; } @@ -20885,7 +20890,7 @@ int32_t llama_tokenize( int32_t n_tokens_max, bool add_special, bool parse_special) { - return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special); + return llama_tokenize_impl(model->tokenizer, text, text_len, tokens, n_tokens_max, add_special, parse_special); } int32_t llama_token_to_piece( diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index d3d21331b..4d49850c9 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -7,6 +7,7 @@ #include #include #include +#include //static const std::map> & k_tests() { // static std::map> _k_tests = { @@ -194,45 +195,64 @@ int main(int argc, char **argv) { const bool add_special = false; - for (const auto & test_kv : k_tests) { - const std::vector res = llama_tokenize(ctx, test_kv.first, add_special, false); + // multi-threaded tokenization + const int nthread = std::thread::hardware_concurrency(); + std::vector threads(nthread); - printf("\n"); - printf("src: '%s'\n", test_kv.first.c_str()); - printf("res: '%s'\n", llama_detokenize(ctx, res).c_str()); - printf("tok: "); - for (const auto & tok : res) { - printf("%d ", tok); - } - printf("\n"); + for (int i = 0; i < nthread; i++) { + threads[i] = std::thread([&, i]() { + for (const auto & test_kv : k_tests) { + const std::vector res = llama_tokenize(ctx, test_kv.first, add_special, false); - bool correct = res.size() == test_kv.second.size(); - for (int i = 0; i < (int) res.size() && correct; ++i) { - if (test_kv.second[i] != res[i]) { - correct = false; + // here only print the result of the first thread + // because the other threads are running the same tests + if (i != 0) { + continue; + } + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + for (int i = 0; i < (int) res.size() && correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize(ctx, res).c_str(), + llama_detokenize(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + + success = false; + } } - } - - if (!correct) { - fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); - fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - llama_detokenize(ctx, res).c_str(), - llama_detokenize(ctx, test_kv.second).c_str()); - fprintf(stderr, "%s : expected tokens: ", __func__); - for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); - } - fprintf(stderr, "\n"); - fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res) { - fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); - } - fprintf(stderr, "\n"); - - success = false; - } + }); } + for (int i = 0; i < nthread; i++) { + threads[i].join(); + } + + // single threaded tokenization if (!fname_text.empty()) { fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());