Fix compiler complaints

2024-08-05 23:55:17 +02:00 · 2024-08-05 23:55:17 +02:00 · 2ca313830e
commit 2ca313830e
parent 674f0faa74
2 changed files with 11 additions and 9 deletions
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
            case codepoint_categ::P:     return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
            case codepoint_categ::S:     return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
            case codepoint_categ::Z:     return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
-            default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST;
+            default:                     GGML_ABORT("invalid category");
        }
    };

@@ -709,6 +709,8 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
        return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
    };

+    GGML_ASSERT(sizeof(wchar_t) == sizeof(u_int32_t));
+
    const auto cpts = unicode_cpts_from_utf8(text);

    std::vector<size_t> bpe_offsets = { cpts.size() };
@@ -756,7 +758,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
        wregex_whitespaces += L"\\s";
        for (uint32_t cpt : unicode_vec_whitespace) {
            if (cpt >= 0x80) {  // non-ASCII whitespaces
-                if (wregex_whitespaces.back() + 1 == cpt) {
+                if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) {
                    if (*(wregex_whitespaces.end() - 2) == '-') {
                        wregex_whitespaces.back() = cpt;
                    } else {
@@ -764,7 +766,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                        wregex_whitespaces += cpt;
                    }
                } else {
-                    wregex_whitespaces += cpt;
+                    wregex_whitespaces += (wchar_t) cpt;
                }
            }
        }
@@ -847,7 +849,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                }
                // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
                categ.set_flag(codepoint_categ::WHITESPACE, inside_square);  //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
-                regex_expr_categs.emplace_back(i, categ);
+                regex_expr_categs.emplace_back((uint32_t)i, categ);
                i += cpts_regex[i + 4] == '}' ? 4 : 5;
                continue;
            }
@@ -855,7 +857,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
            if (cpt == '\\') {
                if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') {  // \s \S
                    // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
-                    regex_expr_categs.emplace_back(i, categ_whitespace);
+                    regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace);
                    //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
                    regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
                    i += 1;
@@ -875,9 +877,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                    case 't':  ++i;  cpt = '\t';  break;
                    case 'r':  ++i;  cpt = '\r';  break;
                    case 'n':  ++i;  cpt = '\n';  break;
-                    case 'x':  GGML_ABORT("TODO");  break;  //TODO: hex values
-                    case 'u':  GGML_ABORT("TODO");  break;  //TODO: unicode values
-                    case 'U':  GGML_ABORT("TODO");  break;  //TODO: unicode values
+                    case 'x':  GGML_ABORT("TODO");  //TODO: hex values
+                    case 'u':  GGML_ABORT("TODO");  //TODO: unicode values
+                    case 'U':  GGML_ABORT("TODO");  //TODO: unicode values
                    default:  // escaped character
                        GGML_ASSERT(!is_cpt_range);
                        cpt = cpts_regex[++i];
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -149,7 +149,7 @@ struct codepoint_categ {
                return 0;
            }
            const char * p = strchr(subcategs, subcateg);
-            return p ? (p - subcategs + 1) : 0;
+            return (uint16_t) (p ? (p - subcategs + 1) : 0);
        };
        switch(categ) {
            case 'C':  if(subcateg == 'n') return 0;  // undefined