wpm : use C locale for ispunct/isspace

This commit is contained in:
Jared Van Bortel 2024-03-25 15:52:28 -04:00
parent b06c16ef9f
commit b80c0af078

View file

@@ -61,6 +61,7 @@
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <cassert> #include <cassert>
+ #include <cctype>
#include <cfloat> #include <cfloat>
#include <cinttypes> #include <cinttypes>
#include <climits> #include <climits>
@@ -71,7 +72,6 @@
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
- #include <cwctype>
#include <forward_list> #include <forward_list>
#include <fstream> #include <fstream>
#include <functional> #include <functional>
@@ -10706,7 +10706,7 @@ struct llm_tokenizer_wpm {
std::vector<std::string> words; std::vector<std::string> words;
while (r < new_str.size()) { while (r < new_str.size()) {
// if is whitespace // if is whitespace
- if (isspace(new_str[r])) {
+ if (isspace(new_str[r], std::locale::classic())) {
if (r > l) words.push_back(new_str.substr(l, (r - l))); if (r > l) words.push_back(new_str.substr(l, (r - l)));
l = r + 1; l = r + 1;
r = l; r = l;
@@ -10731,7 +10731,11 @@ struct llm_tokenizer_wpm {
} }
bool is_ascii_punct(uint32_t code) { bool is_ascii_punct(uint32_t code) {
- return code < 256 && ispunct(code);
+ if (code > 0xFF) {
+ return false;
+ }
+ auto c = char(static_cast<unsigned char>(code));
+ return ispunct(c, std::locale::classic());
} }
bool is_chinese_char(uint32_t cpt) { bool is_chinese_char(uint32_t cpt) {