diff --git a/unicode-data.cpp b/unicode-data.cpp index 01a3aa173..c839f4477 100644 --- a/unicode-data.cpp +++ b/unicode-data.cpp @@ -1666,4 +1666,4 @@ const std::map unicode_regex_equivalent_wregex = { const std::set unicode_regex_with_custom_preprocessor = { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" -}; \ No newline at end of file +}; diff --git a/unicode-data.h b/unicode-data.h index e19cef0bf..4d6315ab9 100644 --- a/unicode-data.h +++ b/unicode-data.h @@ -17,4 +17,4 @@ extern const std::vector> unicode_ranges_control; extern const std::multimap unicode_map_nfd; extern const std::map unicode_map_lowercase; extern const std::map unicode_regex_equivalent_wregex; -extern const std::set unicode_regex_with_custom_preprocessor; \ No newline at end of file +extern const std::set unicode_regex_with_custom_preprocessor; diff --git a/unicode.cpp b/unicode.cpp index 3214387e5..964d13606 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -197,18 +197,14 @@ static std::unordered_map unicode_utf8_to_byte_map() { return map; } -static inline std::wstring unicode_wstring_from_utf8(const std::string & s) -{ +static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { std::wstring_convert> conv; return conv.from_bytes(s); } -static inline std::string unicode_wstring_to_utf8(const std::wstring & ws) -{ - // code to convert from utf32/utf16 to utf8 - std::wstring_convert, wchar_t> converter; - std::string utf8 = converter.to_bytes(ws); - return utf8; +static inline std::string unicode_wstring_to_utf8(const std::wstring & ws) { + std::wstring_convert> conv; + return conv.to_bytes(ws); } static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) { @@ -233,7 +229,7 @@ static std::vector unicode_gpt2_regex_preprocess(const std::wstring & wt bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size size_t start = 0; - for(auto offset : offsets) { + for (auto offset : offsets) { const std::string text = unicode_wstring_to_utf8(std::wstring(wtext, start, offset)); std::string token = ""; @@ -248,15 +244,17 @@ static std::vector unicode_gpt2_regex_preprocess(const std::wstring & wt text_utf.reserve(text.size()); const auto cpts = unicode_cpts_from_utf8(text); - for (size_t i = 0; i < cpts.size(); ++i) + for (size_t i = 0; i < cpts.size(); ++i) { text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i])); + } for (int i = 0; i < (int)text_utf.size(); i++) { const std::string & utf_char = text_utf[i]; bool split_condition = false; int bytes_remain = text_utf.size() - i; + // forward backward lookups - const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; + const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; // handling contractions @@ -357,6 +355,7 @@ static std::vector unicode_gpt2_regex_preprocess(const std::wstring & wt token += utf_char; } } + start += offset; } @@ -402,8 +401,8 @@ static bool unicode_regex_with_custom_preprocessor_exists(const std::string & re static std::vector unicode_regex_custom_preprocess(const std::string & regex, const std::wstring & wtext, const std::vector & offsets) { std::vector bpe_offsets; - - if(regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + + if (regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { bpe_offsets = unicode_gpt2_regex_preprocess(wtext, offsets); } @@ -491,16 +490,15 @@ char32_t unicode_tolower(char32_t cp) { auto it = unicode_map_lowercase.find(cp); return it == unicode_map_lowercase.end() ? cp : it->second; } - + std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { std::wstring wtext = unicode_wstring_from_utf8(text); std::vector bpe_offsets = {wtext.size()}; - for(auto & regex_expr : regex_exprs) { - + for (auto & regex_expr : regex_exprs) { if (unicode_regex_equivalent_wregex_exists(regex_expr)) { - const std::wstring& wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr); + const std::wstring & wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr); bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr); } else if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) { bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets); @@ -512,10 +510,10 @@ std::vector unicode_regex_split(const std::string & text, const std std::vector bpe_words; bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size size_t start = 0; - for(size_t & offset : bpe_offsets) { + for (size_t & offset : bpe_offsets) { bpe_words.emplace_back(unicode_wstring_to_utf8(std::wstring(wtext, start, offset))); start += offset; } return unicode_byte_encoding_process(bpe_words); -} \ No newline at end of file +}