From e9891769ffdf0dbbbc4e6581f32b24b1ff49ac96 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 26 Apr 2024 15:09:07 +0300
Subject: [PATCH] unicode : first try custom implementations

---
 unicode.cpp | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/unicode.cpp b/unicode.cpp
index af2d2ad34..c87b24835 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -203,14 +203,8 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
 }
 
 static inline std::string unicode_wstring_to_utf8(const std::wstring & ws) {
-#if defined(_WIN32)
-    // code to convert from utf32/utf16 to utf8
-    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
-    return converter.to_bytes(ws);
-#else
     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
     return conv.to_bytes(ws);
-#endif
 }
 
 static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
@@ -503,11 +497,11 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     std::vector<size_t> bpe_offsets = {wtext.size()};
 
     for (auto & regex_expr : regex_exprs) {
-        if (unicode_regex_equivalent_wregex_exists(regex_expr)) {
+        if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) {
+            bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets);
+        } else if (unicode_regex_equivalent_wregex_exists(regex_expr)) {
             const std::wstring & wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr);
             bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr);
-        } else if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) {
-            bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets);
         } else {
             throw std::runtime_error("Unicode regex is not found");
         }