From ce5485aee0ffe5c7b326289037e864a129ec31d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Apr 2024 17:11:34 +0300 Subject: [PATCH] unicode : always use std::wregex --- tests/test-tokenizer-0-deepseek-llm.cpp | 6 ++--- unicode-data.cpp | 2 +- unicode.cpp | 33 +++---------------------- 3 files changed, 6 insertions(+), 35 deletions(-) diff --git a/tests/test-tokenizer-0-deepseek-llm.cpp b/tests/test-tokenizer-0-deepseek-llm.cpp index c621e02d9..e21d16c88 100644 --- a/tests/test-tokenizer-0-deepseek-llm.cpp +++ b/tests/test-tokenizer-0-deepseek-llm.cpp @@ -130,16 +130,14 @@ int main(int argc, char **argv) { llama_detokenize_bpe(ctx, test_kv.second).c_str()); fprintf(stderr, "%s : expected tokens: ", __func__); for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d, ", t); + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); } fprintf(stderr, "\n"); fprintf(stderr, "%s : got tokens: ", __func__); for (const auto & t : res) { - fprintf(stderr, "%6d, ", t); + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); } fprintf(stderr, "\n"); - - success = false; } } diff --git a/unicode-data.cpp b/unicode-data.cpp index 526b69865..d36983601 100644 --- a/unicode-data.cpp +++ b/unicode-data.cpp @@ -1,4 +1,4 @@ -#include "unicode-data.h" +#include "unicode-data.h" #include #include diff --git a/unicode.cpp b/unicode.cpp index 2e59c0722..388e92379 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -391,35 +391,6 @@ static std::vector unicode_regex_preprocess(const std::wstring & text, c return bpe_offsets; } -static std::vector unicode_regex_preprocess_fallback(const std::string & text, const std::vector & offsets, const std::string & regex_expr) { - std::regex expr(regex_expr); - std::vector bpe_offsets; // store the offset of each word - bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size - size_t start = 0; - for (auto offset : offsets) { - std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr); - std::cregex_iterator end; - - int64_t start_idx = 0; - while (it != end) { - std::cmatch match = *it; - if (match.position() > start_idx) { - bpe_offsets.emplace_back(match.position() - start_idx); - } - bpe_offsets.emplace_back(match.length()); - start_idx = match.position() + match.length(); - ++it; - } - - if (start_idx < (int64_t) offset) { - bpe_offsets.emplace_back(offset - start_idx); - } - start += offset; - } - - return bpe_offsets; -} - static bool unicode_regex_equivalent_wregex_exists(const std::string & regex) { return unicode_regex_equivalent_wregex.find(regex) != unicode_regex_equivalent_wregex.end(); } @@ -532,8 +503,10 @@ std::vector unicode_regex_split(const std::string & text, const std const std::wstring & wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr); bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr); } else { + // fallback try { - bpe_offsets = unicode_regex_preprocess_fallback(text, bpe_offsets, regex_expr); + const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); + bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr); } catch (std::regex_error & e) { fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str()); fprintf(stderr, "Regex error: %s\n", e.what());