From 07530a8dcea8441e2a3247b90afc6b246b1d443d Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Thu, 13 Jun 2024 20:43:42 +0200 Subject: [PATCH] Fix unicode whitespaces (deepseek-coder) --- unicode.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/unicode.cpp b/unicode.cpp index 2d638613f..6d6b905ea 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -681,10 +681,14 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag(); + const auto flags = unicode_cpt_flags(cpts[i]); - if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) { - text_collapsed[i] = k_ucat_cpt.at(cpt_flag); + if (flags.is_whitespace) { + //NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex do. + //text_collapsed[i] = (char) 0x85; // as whitespace fallback + text_collapsed[i] = (char) 0x0B; // as whitespace fallback + } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(flags.category_flag()); } else { text_collapsed[i] = (char) 0xD0; // fallback }