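Fix compiler complaints (explicit integer/wchar_t casts; GGML_ABORT on unreachable paths instead of GGML_ASSERT(false) and dead `break`s)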

This commit is contained in:
jaime-m-p 2024-08-05 23:55:17 +02:00
parent 674f0faa74
commit 2ca313830e
2 changed files with 11 additions and 9 deletions

View file

@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex); case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex); case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex); case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST; default: GGML_ABORT("invalid category");
} }
}; };
@ -709,6 +709,8 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range); return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
}; };
GGML_ASSERT(sizeof(wchar_t) == sizeof(u_int32_t));
const auto cpts = unicode_cpts_from_utf8(text); const auto cpts = unicode_cpts_from_utf8(text);
std::vector<size_t> bpe_offsets = { cpts.size() }; std::vector<size_t> bpe_offsets = { cpts.size() };
@ -756,7 +758,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
wregex_whitespaces += L"\\s"; wregex_whitespaces += L"\\s";
for (uint32_t cpt : unicode_vec_whitespace) { for (uint32_t cpt : unicode_vec_whitespace) {
if (cpt >= 0x80) { // non-ASCII whitespaces if (cpt >= 0x80) { // non-ASCII whitespaces
if (wregex_whitespaces.back() + 1 == cpt) { if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) {
if (*(wregex_whitespaces.end() - 2) == '-') { if (*(wregex_whitespaces.end() - 2) == '-') {
wregex_whitespaces.back() = cpt; wregex_whitespaces.back() = cpt;
} else { } else {
@ -764,7 +766,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
wregex_whitespaces += cpt; wregex_whitespaces += cpt;
} }
} else { } else {
wregex_whitespaces += cpt; wregex_whitespaces += (wchar_t) cpt;
} }
} }
} }
@ -847,7 +849,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
} }
// (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
regex_expr_categs.emplace_back(i, categ); regex_expr_categs.emplace_back((uint32_t)i, categ);
i += cpts_regex[i + 4] == '}' ? 4 : 5; i += cpts_regex[i + 4] == '}' ? 4 : 5;
continue; continue;
} }
@ -855,7 +857,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
if (cpt == '\\') { if (cpt == '\\') {
if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') { // \s \S if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') { // \s \S
// (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
regex_expr_categs.emplace_back(i, categ_whitespace); regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace);
//NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square); regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
i += 1; i += 1;
@ -875,9 +877,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
case 't': ++i; cpt = '\t'; break; case 't': ++i; cpt = '\t'; break;
case 'r': ++i; cpt = '\r'; break; case 'r': ++i; cpt = '\r'; break;
case 'n': ++i; cpt = '\n'; break; case 'n': ++i; cpt = '\n'; break;
case 'x': GGML_ABORT("TODO"); break; //TODO: hex values case 'x': GGML_ABORT("TODO"); //TODO: hex values
case 'u': GGML_ABORT("TODO"); break; //TODO: unicode values case 'u': GGML_ABORT("TODO"); //TODO: unicode values
case 'U': GGML_ABORT("TODO"); break; //TODO: unicode values case 'U': GGML_ABORT("TODO"); //TODO: unicode values
default: // escaped character default: // escaped character
GGML_ASSERT(!is_cpt_range); GGML_ASSERT(!is_cpt_range);
cpt = cpts_regex[++i]; cpt = cpts_regex[++i];

View file

@ -149,7 +149,7 @@ struct codepoint_categ {
return 0; return 0;
} }
const char * p = strchr(subcategs, subcateg); const char * p = strchr(subcategs, subcateg);
return p ? (p - subcategs + 1) : 0; return (uint16_t) (p ? (p - subcategs + 1) : 0);
}; };
switch(categ) { switch(categ) {
case 'C': if(subcateg == 'n') return 0; // undefined case 'C': if(subcateg == 'n') return 0; // undefined