diff --git a/src/unicode.cpp b/src/unicode.cpp index 7cd479450..a5a377b39 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -2,10 +2,10 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif +#include "ggml.h" #include "unicode.h" #include "unicode-data.h" -#include #include #include #include @@ -201,7 +201,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t for (auto offset : offsets) { const size_t offset_ini = start; const size_t offset_end = start + offset; - assert(offset_end <= cpts.size()); + GGML_ASSERT(offset_end <= cpts.size()); start = offset_end; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; @@ -216,7 +216,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t size_t _prev_end = offset_ini; auto _add_token = [&] (const size_t end) -> size_t { - assert(_prev_end <= end && end <= offset_end); + GGML_ASSERT(_prev_end <= end && end <= offset_end); size_t len = end - _prev_end; if (len > 0) { bpe_offsets.push_back(len); @@ -320,7 +320,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & for (auto offset : offsets) { const size_t offset_ini = start; const size_t offset_end = start + offset; - assert(offset_end <= cpts.size()); + GGML_ASSERT(offset_end <= cpts.size()); start = offset_end; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; @@ -335,7 +335,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & size_t _prev_end = offset_ini; auto _add_token = [&] (const size_t end) -> size_t { - assert(_prev_end <= end && end <= offset_end); + GGML_ASSERT(_prev_end <= end && end <= offset_end); size_t len = end - _prev_end; if (len > 0) { bpe_offsets.push_back(len); @@ -595,7 +595,7 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { cpt_categs[cpt++] = categ; } } - assert (cpt == MAX_CODEPOINTS); + GGML_ASSERT(cpt == MAX_CODEPOINTS); for (auto cpt : unicode_vec_whitespace) { cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); @@ -694,7 +694,7 @@ std::vector unicode_regex_split(const std::string & text, const std case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex); case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex); case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex); - default: assert (false); return COLLAPSE_CPT_RANGE_FIRST; + default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST; } }; @@ -703,7 +703,7 @@ std::vector unicode_regex_split(const std::string & text, const std // \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes // \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes // \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no - assert ((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0); + GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0); const uint32_t collapsed = category_to_collapsed_cpt(categ); const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ? return std::pair(collapsed, collapsed + range); @@ -811,17 +811,17 @@ std::vector unicode_regex_split(const std::string & text, const std continue; case '}': case ']': - assert (false); + GGML_ABORT("invalid regex"); case '(': if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (? unicode_regex_split(const std::string & text, const std // parse unicode categories and subcategories if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') { - assert (cpts_regex[i + 3] && cpts_regex[i + 4]); + GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]); codepoint_categ categ = {}; if (cpts_regex[i + 4] == '}') { categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]); } else { categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]); - assert (cpts_regex[i + 5] == '}'); + GGML_ASSERT(cpts_regex[i + 5] == '}'); } // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' @@ -875,19 +875,19 @@ std::vector unicode_regex_split(const std::string & text, const std case 't': ++i; cpt = '\t'; break; case 'r': ++i; cpt = '\r'; break; case 'n': ++i; cpt = '\n'; break; - case 'x': assert (false); break; //TODO: hex values - case 'u': assert (false); break; //TODO: unicode values - case 'U': assert (false); break; //TODO: unicode values + case 'x': GGML_ABORT("TODO"); break; //TODO: hex values + case 'u': GGML_ABORT("TODO"); break; //TODO: unicode values + case 'U': GGML_ABORT("TODO"); break; //TODO: unicode values default: // escaped character - assert (!is_cpt_range); + GGML_ASSERT(!is_cpt_range); cpt = cpts_regex[++i]; - assert (cpt < 0x80); + GGML_ASSERT(cpt < 0x80); break; } } // ensure there is not a collission with any "collapsed" codepoints - assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt); + GGML_ASSERT(cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt); // (2) Build a list of codepoint ranges if (is_cpt_range) { @@ -924,7 +924,7 @@ std::vector unicode_regex_split(const std::string & text, const std } // (1.1) Generate a replacement list of codepoint ranges codepoint_categ categ = unicode_cpt_category(range.first); - assert (categ == unicode_cpt_category(range.second)); + GGML_ASSERT(categ == unicode_cpt_category(range.second)); auto it0 = map_categ_wregex.find(categ.get_category()); auto it1 = map_categ_wregex.find(categ.get_subcategory()); for (const auto & it : {it0, it1}) { @@ -949,25 +949,25 @@ std::vector unicode_regex_split(const std::string & text, const std wregex_collapsed += (wchar_t) cpts_regex[i]; i++; } - assert (cpts_regex[i] == '\\'); + GGML_ASSERT(cpts_regex[i] == '\\'); const uint32_t cpt_next = cpts_regex[i + 1]; const bool is_negated = cpt_next < 'a'; // is uppercase if (cpt_next == 'p' || cpt_next == 'P') { - assert (cpts_regex[i + 2] == '{' && cpts_regex[i + 3]); + GGML_ASSERT(cpts_regex[i + 2] == '{' && cpts_regex[i + 3]); i += cpts_regex[i + 4] == '}' ? 5 : 6; - assert (cpts_regex[i - 1] == '}'); + GGML_ASSERT(cpts_regex[i - 1] == '}'); } else { - assert (cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d - cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D + GGML_ASSERT(cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d + cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D i += 2; } // (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists. const codepoint_categ categ = offset_categ.second; auto it = map_categ_wregex.find(categ.get_subcategory()); - assert (it != map_categ_wregex.end()); + GGML_ASSERT(it != map_categ_wregex.end()); if (it != map_categ_wregex.end()) { if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE - assert (is_negated == false); + GGML_ASSERT(is_negated == false); wregex_collapsed += it->second; } else if(it->second.size() == 1 && !is_negated) { wregex_collapsed += it->second;