diff --git a/unicode.cpp b/unicode.cpp index 2d638613f..6d6b905ea 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -681,10 +681,14 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag(); + const auto flags = unicode_cpt_flags(cpts[i]); - if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(cpt_flag); + if (flags.is_whitespace) { + //NOTE: C++ std::regex \s does not match 0x85, but Rust and Python regexes do. + //text_collapsed[i] = (char) 0x85; // as whitespace fallback + text_collapsed[i] = (char) 0x0B; // as whitespace fallback + } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(flags.category_flag()); } else { text_collapsed[i] = (char) 0xD0; // fallback }