diff --git a/llama.cpp b/llama.cpp index 61587cb7a..59ac27f51 100644 --- a/llama.cpp +++ b/llama.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -10706,7 +10706,7 @@ struct llm_tokenizer_wpm { std::vector words; while (r < new_str.size()) { // if is whitespace - if (isspace(new_str[r])) { + if (isspace(new_str[r], std::locale::classic())) { if (r > l) words.push_back(new_str.substr(l, (r - l))); l = r + 1; r = l; @@ -10731,7 +10731,11 @@ struct llm_tokenizer_wpm { } bool is_ascii_punct(uint32_t code) { - return code < 256 && ispunct(code); + if (code > 0xFF) { + return false; + } + auto c = char(static_cast(code)); + return ispunct(c, std::locale::classic()); } bool is_chinese_char(uint32_t cpt) {