Use GGML_ASSERT and GGML_ABORT
This commit is contained in:
parent
85c59df9ce
commit
735105edf9
1 changed file with 27 additions and 27 deletions
|
@ -2,10 +2,10 @@
|
||||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
#include "unicode-data.h"
|
#include "unicode-data.h"
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
@ -201,7 +201,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||||
for (auto offset : offsets) {
|
for (auto offset : offsets) {
|
||||||
const size_t offset_ini = start;
|
const size_t offset_ini = start;
|
||||||
const size_t offset_end = start + offset;
|
const size_t offset_end = start + offset;
|
||||||
assert(offset_end <= cpts.size());
|
GGML_ASSERT(offset_end <= cpts.size());
|
||||||
start = offset_end;
|
start = offset_end;
|
||||||
|
|
||||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||||
|
@ -216,7 +216,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||||
|
|
||||||
size_t _prev_end = offset_ini;
|
size_t _prev_end = offset_ini;
|
||||||
auto _add_token = [&] (const size_t end) -> size_t {
|
auto _add_token = [&] (const size_t end) -> size_t {
|
||||||
assert(_prev_end <= end && end <= offset_end);
|
GGML_ASSERT(_prev_end <= end && end <= offset_end);
|
||||||
size_t len = end - _prev_end;
|
size_t len = end - _prev_end;
|
||||||
if (len > 0) {
|
if (len > 0) {
|
||||||
bpe_offsets.push_back(len);
|
bpe_offsets.push_back(len);
|
||||||
|
@ -320,7 +320,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||||
for (auto offset : offsets) {
|
for (auto offset : offsets) {
|
||||||
const size_t offset_ini = start;
|
const size_t offset_ini = start;
|
||||||
const size_t offset_end = start + offset;
|
const size_t offset_end = start + offset;
|
||||||
assert(offset_end <= cpts.size());
|
GGML_ASSERT(offset_end <= cpts.size());
|
||||||
start = offset_end;
|
start = offset_end;
|
||||||
|
|
||||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||||
|
@ -335,7 +335,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||||
|
|
||||||
size_t _prev_end = offset_ini;
|
size_t _prev_end = offset_ini;
|
||||||
auto _add_token = [&] (const size_t end) -> size_t {
|
auto _add_token = [&] (const size_t end) -> size_t {
|
||||||
assert(_prev_end <= end && end <= offset_end);
|
GGML_ASSERT(_prev_end <= end && end <= offset_end);
|
||||||
size_t len = end - _prev_end;
|
size_t len = end - _prev_end;
|
||||||
if (len > 0) {
|
if (len > 0) {
|
||||||
bpe_offsets.push_back(len);
|
bpe_offsets.push_back(len);
|
||||||
|
@ -595,7 +595,7 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
|
||||||
cpt_categs[cpt++] = categ;
|
cpt_categs[cpt++] = categ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert (cpt == MAX_CODEPOINTS);
|
GGML_ASSERT(cpt == MAX_CODEPOINTS);
|
||||||
|
|
||||||
for (auto cpt : unicode_vec_whitespace) {
|
for (auto cpt : unicode_vec_whitespace) {
|
||||||
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
|
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
|
||||||
|
@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
|
case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
|
||||||
case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
|
case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
|
||||||
case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
|
case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
|
||||||
default: assert (false); return COLLAPSE_CPT_RANGE_FIRST;
|
default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -703,7 +703,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
// \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes
|
// \p{Ll} --> \p{Ll} to \p{Ll} // has subcategory ? yes
|
||||||
// \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes
|
// \p{Lu} --> \p{Lu} to \p{Lu} // has subcategory ? yes
|
||||||
// \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no
|
// \p{L} --> \p{Ll} to \p{Lu} // has subcategory ? no
|
||||||
assert ((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0);
|
GGML_ASSERT((COLLAPSE_CPT_RANGE_FIRST & 0b111) == 0);
|
||||||
const uint32_t collapsed = category_to_collapsed_cpt(categ);
|
const uint32_t collapsed = category_to_collapsed_cpt(categ);
|
||||||
const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ?
|
const uint32_t range = (collapsed & 0b111) ? 0 : 0b111; // has subcategory ?
|
||||||
return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
|
return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
|
||||||
|
@ -811,17 +811,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
continue;
|
continue;
|
||||||
case '}':
|
case '}':
|
||||||
case ']':
|
case ']':
|
||||||
assert (false);
|
GGML_ABORT("invalid regex");
|
||||||
case '(':
|
case '(':
|
||||||
if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (?<!
|
if (cpts_regex[i + 1] == '?') { // (?: (?i: (?= (?! (?<= (?<!
|
||||||
if (cpts_regex[i + 2] == ':') {
|
if (cpts_regex[i + 2] == ':') {
|
||||||
i += 2;
|
i += 2;
|
||||||
} else if (cpts_regex[i + 2] == 'i') {
|
} else if (cpts_regex[i + 2] == 'i') {
|
||||||
i += 3;
|
i += 3;
|
||||||
assert (cpts_regex[i] == ':');
|
GGML_ASSERT(cpts_regex[i] == ':');
|
||||||
} else {
|
} else {
|
||||||
i += 2 + (cpts_regex[i + 2] == '<');
|
i += 2 + (cpts_regex[i + 2] == '<');
|
||||||
assert (cpts_regex[i] == '=' || cpts_regex[i] == '!');
|
GGML_ASSERT(cpts_regex[i] == '=' || cpts_regex[i] == '!');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
@ -837,13 +837,13 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
|
|
||||||
// parse unicode categories and subcategories
|
// parse unicode categories and subcategories
|
||||||
if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
|
if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
|
||||||
assert (cpts_regex[i + 3] && cpts_regex[i + 4]);
|
GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]);
|
||||||
codepoint_categ categ = {};
|
codepoint_categ categ = {};
|
||||||
if (cpts_regex[i + 4] == '}') {
|
if (cpts_regex[i + 4] == '}') {
|
||||||
categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]);
|
categ = codepoint_categ::from_chars((char)cpts_regex[i + 3]);
|
||||||
} else {
|
} else {
|
||||||
categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
|
categ = codepoint_categ::from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
|
||||||
assert (cpts_regex[i + 5] == '}');
|
GGML_ASSERT(cpts_regex[i + 5] == '}');
|
||||||
}
|
}
|
||||||
// (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
|
// (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
|
||||||
categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
|
categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
|
||||||
|
@ -875,19 +875,19 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
case 't': ++i; cpt = '\t'; break;
|
case 't': ++i; cpt = '\t'; break;
|
||||||
case 'r': ++i; cpt = '\r'; break;
|
case 'r': ++i; cpt = '\r'; break;
|
||||||
case 'n': ++i; cpt = '\n'; break;
|
case 'n': ++i; cpt = '\n'; break;
|
||||||
case 'x': assert (false); break; //TODO: hex values
|
case 'x': GGML_ABORT("TODO"); break; //TODO: hex values
|
||||||
case 'u': assert (false); break; //TODO: unicode values
|
case 'u': GGML_ABORT("TODO"); break; //TODO: unicode values
|
||||||
case 'U': assert (false); break; //TODO: unicode values
|
case 'U': GGML_ABORT("TODO"); break; //TODO: unicode values
|
||||||
default: // escaped character
|
default: // escaped character
|
||||||
assert (!is_cpt_range);
|
GGML_ASSERT(!is_cpt_range);
|
||||||
cpt = cpts_regex[++i];
|
cpt = cpts_regex[++i];
|
||||||
assert (cpt < 0x80);
|
GGML_ASSERT(cpt < 0x80);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ensure there is not a collision with any "collapsed" codepoints
|
// ensure there is not a collision with any "collapsed" codepoints
|
||||||
assert (cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
|
GGML_ASSERT(cpt < COLLAPSE_CPT_RANGE_FIRST || COLLAPSE_CPT_RANGE_LAST < cpt);
|
||||||
|
|
||||||
// (2) Build a list of codepoint ranges
|
// (2) Build a list of codepoint ranges
|
||||||
if (is_cpt_range) {
|
if (is_cpt_range) {
|
||||||
|
@ -924,7 +924,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
}
|
}
|
||||||
// (1.1) Generate a replacement list of codepoint ranges
|
// (1.1) Generate a replacement list of codepoint ranges
|
||||||
codepoint_categ categ = unicode_cpt_category(range.first);
|
codepoint_categ categ = unicode_cpt_category(range.first);
|
||||||
assert (categ == unicode_cpt_category(range.second));
|
GGML_ASSERT(categ == unicode_cpt_category(range.second));
|
||||||
auto it0 = map_categ_wregex.find(categ.get_category());
|
auto it0 = map_categ_wregex.find(categ.get_category());
|
||||||
auto it1 = map_categ_wregex.find(categ.get_subcategory());
|
auto it1 = map_categ_wregex.find(categ.get_subcategory());
|
||||||
for (const auto & it : {it0, it1}) {
|
for (const auto & it : {it0, it1}) {
|
||||||
|
@ -949,25 +949,25 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
wregex_collapsed += (wchar_t) cpts_regex[i];
|
wregex_collapsed += (wchar_t) cpts_regex[i];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
assert (cpts_regex[i] == '\\');
|
GGML_ASSERT(cpts_regex[i] == '\\');
|
||||||
const uint32_t cpt_next = cpts_regex[i + 1];
|
const uint32_t cpt_next = cpts_regex[i + 1];
|
||||||
const bool is_negated = cpt_next < 'a'; // is uppercase
|
const bool is_negated = cpt_next < 'a'; // is uppercase
|
||||||
if (cpt_next == 'p' || cpt_next == 'P') {
|
if (cpt_next == 'p' || cpt_next == 'P') {
|
||||||
assert (cpts_regex[i + 2] == '{' && cpts_regex[i + 3]);
|
GGML_ASSERT(cpts_regex[i + 2] == '{' && cpts_regex[i + 3]);
|
||||||
i += cpts_regex[i + 4] == '}' ? 5 : 6;
|
i += cpts_regex[i + 4] == '}' ? 5 : 6;
|
||||||
assert (cpts_regex[i - 1] == '}');
|
GGML_ASSERT(cpts_regex[i - 1] == '}');
|
||||||
} else {
|
} else {
|
||||||
assert (cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d
|
GGML_ASSERT(cpt_next == 's' || cpt_next == 'w' || cpt_next == 'd' || // \s \w \d
|
||||||
cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D
|
cpt_next == 'S' || cpt_next == 'W' || cpt_next == 'D'); // \S \W \D
|
||||||
i += 2;
|
i += 2;
|
||||||
}
|
}
|
||||||
// (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists.
|
// (1.4) Build the "collapsed" regex replacing categories and subcategories by this "collapsed" lists.
|
||||||
const codepoint_categ categ = offset_categ.second;
|
const codepoint_categ categ = offset_categ.second;
|
||||||
auto it = map_categ_wregex.find(categ.get_subcategory());
|
auto it = map_categ_wregex.find(categ.get_subcategory());
|
||||||
assert (it != map_categ_wregex.end());
|
GGML_ASSERT(it != map_categ_wregex.end());
|
||||||
if (it != map_categ_wregex.end()) {
|
if (it != map_categ_wregex.end()) {
|
||||||
if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE
|
if (categ.is_whitespace()) { // inside square brackets //NOTE: reusing flag WHITESPACE
|
||||||
assert (is_negated == false);
|
GGML_ASSERT(is_negated == false);
|
||||||
wregex_collapsed += it->second;
|
wregex_collapsed += it->second;
|
||||||
} else if(it->second.size() == 1 && !is_negated) {
|
} else if(it->second.size() == 1 && !is_negated) {
|
||||||
wregex_collapsed += it->second;
|
wregex_collapsed += it->second;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue