wpm : portable unicode tolower (#6305)

Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
2024-03-26 17:46:21 -04:00 · 2024-03-26 17:46:21 -04:00 · 32c8486e1f
commit 32c8486e1f
parent 557410b8f0
9 changed files with 1699 additions and 1425 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -61,6 +61,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -71,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
                continue;
            }
-            code = to_lower(code);
+            code = unicode_tolower(code);
            if (type == CODEPOINT_TYPE_WHITESPACE) {
                code = ' ';
            }
@@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
        std::vector<std::string> words;
        while (r < new_str.size()) {
            // if is whitespace
-            if (isspace(new_str[r])) {
+            if (isspace(new_str[r], std::locale::classic())) {
                if (r > l) words.push_back(new_str.substr(l, (r - l)));
                l = r + 1;
                r = l;
@@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
        return words;
    }

-    uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-        if (code > 0xFFFF) {
-            return code;
-        }
-#endif
-        return std::tolower(wchar_t(code), locale);
-    }
-
    bool is_ascii_punct(uint32_t code) {
-        return code < 256 && ispunct(code);
+        if (code > 0xFF) {
+            return false;
+        }
+        auto c = char(static_cast<unsigned char>(code));
+        return ispunct(c, std::locale::classic());
    }

    bool is_chinese_char(uint32_t cpt) {