bugfix: custom regex split fails with codepoint 0
This commit is contained in:
parent
55e387b2d5
commit
fe3c531915
1 changed file with 6 additions and 4 deletions
10
unicode.cpp
10
unicode.cpp
|
@@ -226,8 +226,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||||
assert(offset_end <= cpts.size());
|
assert(offset_end <= cpts.size());
|
||||||
start = offset_end;
|
start = offset_end;
|
||||||
|
|
||||||
|
static const char32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||||
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
||||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||||
|
@@ -309,7 +310,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||||
}
|
}
|
||||||
|
|
||||||
// regex: \s+(?!\S)
|
// regex: \s+(?!\S)
|
||||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||||
pos += num_whitespaces - 1;
|
pos += num_whitespaces - 1;
|
||||||
_add_token(pos);
|
_add_token(pos);
|
||||||
continue;
|
continue;
|
||||||
|
@@ -344,8 +345,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||||
assert(offset_end <= cpts.size());
|
assert(offset_end <= cpts.size());
|
||||||
start = offset_end;
|
start = offset_end;
|
||||||
|
|
||||||
|
static const char32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||||
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
||||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||||
|
@@ -450,7 +452,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||||
}
|
}
|
||||||
|
|
||||||
// regex: \s+(?!\S)
|
// regex: \s+(?!\S)
|
||||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||||
pos += num_whitespaces - 1;
|
pos += num_whitespaces - 1;
|
||||||
_add_token(pos);
|
_add_token(pos);
|
||||||
continue;
|
continue;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue