Fix unicode whitespaces (deepseek-coder)
This commit is contained in:
parent
974d40b513
commit
07530a8dce
1 changed file with 7 additions and 3 deletions
10
unicode.cpp
10
unicode.cpp
|
@ -681,10 +681,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
|
const auto flags = unicode_cpt_flags(cpts[i]);
|
||||||
|
|
||||||
if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
|
if (flags.is_whitespace) {
|
||||||
text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
|
//NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex do.
|
||||||
|
//text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
|
||||||
|
text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
|
||||||
|
} else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
|
||||||
|
text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
|
||||||
} else {
|
} else {
|
||||||
text_collapsed[i] = (char) 0xD0; // fallback
|
text_collapsed[i] = (char) 0xD0; // fallback
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue