Use GGML_ASSERT and GGML_ABORT

This commit is contained in:
jaime-m-p 2024-08-05 20:54:30 +02:00
parent 85c59df9ce
commit 735105edf9

View file

@ -2,10 +2,10 @@
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif #endif
#include "ggml.h"
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"
#include <cassert>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <map> #include <map>
@ -201,7 +201,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
for (auto offset : offsets) { for (auto offset : offsets) {
const size_t offset_ini = start; const size_t offset_ini = start;
const size_t offset_end = start + offset; const size_t offset_end = start + offset;
assert(offset_end <= cpts.size()); GGML_ASSERT(offset_end <= cpts.size());
start = offset_end; start = offset_end;
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
@ -216,7 +216,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
size_t _prev_end = offset_ini; size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t { auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end); GGML_ASSERT(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end; size_t len = end - _prev_end;
if (len > 0) { if (len > 0) {
bpe_offsets.push_back(len); bpe_offsets.push_back(len);
@ -320,7 +320,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
for (auto offset : offsets) { for (auto offset : offsets) {
const size_t offset_ini = start; const size_t offset_ini = start;
const size_t offset_end = start + offset; const size_t offset_end = start + offset;
assert(offset_end <= cpts.size()); GGML_ASSERT(offset_end <= cpts.size());
start = offset_end; start = offset_end;
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
@ -335,7 +335,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
size_t _prev_end = offset_ini; size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t { auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end); GGML_ASSERT(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end; size_t len = end - _prev_end;
if (len > 0) { if (len > 0) {
bpe_offsets.push_back(len); bpe_offsets.push_back(len);
@ -595,7 +595,7 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
cpt_categs[cpt++] = categ; cpt_categs[cpt++] = categ;
} }
} }
assert (cpt == MAX_CODEPOINTS); GGML_ASSERT(cpt == MAX_CODEPOINTS);
for (auto cpt : unicode_vec_whitespace) { for (auto cpt : unicode_vec_whitespace) {
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex); case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex); case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex); case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
default: assert (false); return COLLAPSE_CPT_RANGE_FIRST; default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST;
} }
}; };
@ -703,7 +703,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
// \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes // \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes
// \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes // \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes
// \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no // \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no
assert ((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0); GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0);
const uint32_t collapsed = category_to_collapsed_cpt(categ); const uint32_t collapsed = category_to_collapsed_cpt(categ);
const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ? const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ?
return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range); return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
@ -811,17 +811,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue; continue;
case '}': case '}':
case ']': case ']':
assert (false); GGML_ABORT("invalid regex");
case '(': case '(':
if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (?<! if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (?<!
if (cpts_regex[i + 2] == ':') { if (cpts_regex[i + 2] == ':') {
i += 2; i += 2;
} else if (cpts_regex[i + 2] == 'i') { } else if (cpts_regex[i + 2] == 'i') {
i += 3; i += 3;
assert (cpts_regex[i] == ':'); GGML_ASSERT(cpts_regex[i] == ':');
} else { } else {
i += 2 + (cpts_regex[i + 2] == '<'); i += 2 + (cpts_regex[i + 2] == '<');
assert (cpts_regex[i] == '=' || cpts_regex[i] == '!'); GGML_ASSERT(cpts_regex[i] == '=' || cpts_regex[i] == '!');
} }
} }
continue; continue;
@ -837,13 +837,13 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
// parse unicode categories and subcategories // parse unicode categories and subcategories
if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') { if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
assert (cpts_regex[i + 3] && cpts_regex[i + 4]); GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]);
codepoint_categ categ = {}; codepoint_categ categ = {};
if (cpts_regex[i + 4] == '}') { if (cpts_regex[i + 4] == '}') {
categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]); categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]);
} else { } else {
categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]); categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
assert (cpts_regex[i + 5] == '}'); GGML_ASSERT(cpts_regex[i + 5] == '}');
} }
// (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex. // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets' categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
@ -875,19 +875,19 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
case 't': ++i; cpt = '\t'; break; case 't': ++i; cpt = '\t'; break;
case 'r': ++i; cpt = '\r'; break; case 'r': ++i; cpt = '\r'; break;
case 'n': ++i; cpt = '\n'; break; case 'n': ++i; cpt = '\n'; break;
case 'x': assert (false); break; //TODO: hex values case 'x': GGML_ABORT("TODO"); break; //TODO: hex values
case 'u': assert (false); break; //TODO: unicode values case 'u': GGML_ABORT("TODO"); break; //TODO: unicode values
case 'U': assert (false); break; //TODO: unicode values case 'U': GGML_ABORT("TODO"); break; //TODO: unicode values
default: // escaped character default: // escaped character
assert (!is_cpt_range); GGML_ASSERT(!is_cpt_range);
cpt = cpts_regex[++i]; cpt = cpts_regex[++i];
assert (cpt < 0x80); GGML_ASSERT(cpt < 0x80);
break; break;
} }
} }
// ensure there is not a collision with any "collapsed" codepoints // ensure there is not a collision with any "collapsed" codepoints
assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt); GGML_ASSERT(cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
// (2) Build a list of codepoint ranges // (2) Build a list of codepoint ranges
if (is_cpt_range) { if (is_cpt_range) {
@ -924,7 +924,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
} }
// (1.1) Generate a replacement list of codepoint ranges // (1.1) Generate a replacement list of codepoint ranges
codepoint_categ categ = unicode_cpt_category(range.first); codepoint_categ categ = unicode_cpt_category(range.first);
assert (categ == unicode_cpt_category(range.second)); GGML_ASSERT(categ == unicode_cpt_category(range.second));
auto it0 = map_categ_wregex.find(categ.get_category()); auto it0 = map_categ_wregex.find(categ.get_category());
auto it1 = map_categ_wregex.find(categ.get_subcategory()); auto it1 = map_categ_wregex.find(categ.get_subcategory());
for (const auto & it : {it0, it1}) { for (const auto & it : {it0, it1}) {
@ -949,25 +949,25 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
wregex_collapsed += (wchar_t) cpts_regex[i]; wregex_collapsed += (wchar_t) cpts_regex[i];
i++; i++;
} }
assert (cpts_regex[i] == '\\'); GGML_ASSERT(cpts_regex[i] == '\\');
const uint32_t cpt_next = cpts_regex[i + 1]; const uint32_t cpt_next = cpts_regex[i + 1];
const bool is_negated = cpt_next < 'a'; // is uppercase const bool is_negated = cpt_next < 'a'; // is uppercase
if (cpt_next == 'p' || cpt_next == 'P') { if (cpt_next == 'p' || cpt_next == 'P') {
assert (cpts_regex[i + 2] == '{' && cpts_regex[i + 3]); GGML_ASSERT(cpts_regex[i + 2] == '{' && cpts_regex[i + 3]);
i += cpts_regex[i + 4] == '}' ? 5 : 6; i += cpts_regex[i + 4] == '}' ? 5 : 6;
assert (cpts_regex[i - 1] == '}'); GGML_ASSERT(cpts_regex[i - 1] == '}');
} else { } else {
assert (cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d GGML_ASSERT(cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d
cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D
i += 2; i += 2;
} }
// (1.4) Build the "collapsed" regex replacing categories and subcategories by these "collapsed" lists. // (1.4) Build the "collapsed" regex replacing categories and subcategories by these "collapsed" lists.
const codepoint_categ categ = offset_categ.second; const codepoint_categ categ = offset_categ.second;
auto it = map_categ_wregex.find(categ.get_subcategory()); auto it = map_categ_wregex.find(categ.get_subcategory());
assert (it != map_categ_wregex.end()); GGML_ASSERT(it != map_categ_wregex.end());
if (it != map_categ_wregex.end()) { if (it != map_categ_wregex.end()) {
if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE
assert (is_negated == false); GGML_ASSERT(is_negated == false);
wregex_collapsed += it->second; wregex_collapsed += it->second;
} else if(it->second.size() == 1 && !is_negated) { } else if(it->second.size() == 1 && !is_negated) {
wregex_collapsed += it->second; wregex_collapsed += it->second;