From 21851c11d12ef14bd2f521e444de9bdcbdb1757d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Feb 2024 10:39:21 +0200 Subject: [PATCH] tests : multi-thread the tokenizer tests ggml-ci --- llama.cpp | 24 +++++----- tests/test-tokenizer-1-bpe.cpp | 77 ++++++++++++++++---------------- tests/test-tokenizer-1-llama.cpp | 53 ++++++++++++---------- 3 files changed, 82 insertions(+), 72 deletions(-) diff --git a/llama.cpp b/llama.cpp index d316d067b..bf578399e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7759,7 +7759,7 @@ struct llm_bigram_spm { }; struct llm_tokenizer_spm { - llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {} + llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars @@ -7834,6 +7834,7 @@ private: if (p == rev_merge.end()) { // output any symbols that did not form tokens as bytes. + output.reserve(output.size() + symbol.n); for (int j = 0; j < (int)symbol.n; ++j) { llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]); output.push_back(token_id); @@ -8396,17 +8397,18 @@ struct fragment_buffer_variant { token(_token), raw_text(_dummy), offset(0), - length(0){} + length(0) {} + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) : type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), - token((llama_vocab::id)-1), + token((llama_vocab::id) - 1), raw_text(_raw_text), offset(_offset), length(_length){ - GGML_ASSERT( _offset >= 0 ); - GGML_ASSERT( _length >= 1 ); - GGML_ASSERT( offset + length <= raw_text.length() ); + GGML_ASSERT(_offset >= 0); + GGML_ASSERT(_length >= 1); + GGML_ASSERT(offset + length <= raw_text.length()); } const FRAGMENT_BUFFER_VARIANT_TYPE type; @@ -8530,14 +8532,14 @@ static std::vector llama_tokenize_internal(const llama_vocab & } std::forward_list fragment_buffer; - fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); + fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); - if (special) tokenizer_st_partition( vocab, fragment_buffer ); + if (special) tokenizer_st_partition(vocab, fragment_buffer); switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { // without adding this leading whitespace, we do not get the same results as the original tokenizer @@ -8565,7 +8567,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } break; case LLAMA_VOCAB_TYPE_BPE: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -8581,7 +8583,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } break; case LLAMA_VOCAB_TYPE_WPM: { - for (const auto & fragment: fragment_buffer) { + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 386530f23..3bb629561 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -4,13 +4,13 @@ #include "console.h" #include +#include #include #include -#include -#include -#include -#include #include +#include +#include +#include int main(int argc, char **argv) { if (argc < 2) { @@ -74,45 +74,46 @@ int main(int argc, char **argv) { } } catch (const std::invalid_argument &) { - fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); + //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); } } - for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { - // NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters - if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) { - std::string str = " " + codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 3; - } - } - } - // Restrict to assigned unicode planes - // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { - for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; - } - } - for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_bpe(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx]() { + for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) { + if (!( // NOLINT + (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && + (cp < 0x13 || cp > 0x17) && cp != 0x19 && + (cp < 0x1c || cp > 0x1e) && + (cp < 0xd800 || cp > 0xdfff) && + (cp < 0x00040000 || cp >= 0x000e0000) + )) { + continue; + } + + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_bpe(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + std::exit(3); + } + } + }); + } + + for (auto & t : threads) { + t.join(); } } + llama_free_model(model); llama_free(ctx); diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp index 4b58fe495..b0d814a41 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ b/tests/test-tokenizer-1-llama.cpp @@ -4,13 +4,13 @@ #include "console.h" #include +#include #include #include -#include -#include -#include -#include #include +#include +#include +#include int main(int argc, char **argv) { if (argc < 2) { @@ -72,26 +72,33 @@ int main(int argc, char **argv) { } } - for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) { - if (cp < 0xd800 || cp > 0xdfff) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_spm(ctx, tokens); - if (cp != 9601 && str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 3; - } + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx]() { + for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) { + if (cp >= 0xd800 && cp <= 0xdfff) { + continue; + } + + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_spm(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + std::exit(3); + } + } + }); } - } - for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { - std::string str = codepoint_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize_spm(ctx, tokens); - if (str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - return 4; + + for (auto & t : threads) { + t.join(); } }