This commit is contained in:
jaime-m-p 2024-04-29 00:12:56 +02:00
parent 6e4d2af6c3
commit 2a48873914

View file

@ -388,13 +388,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
size_t _prev_end = offset_ini; size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t { auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end); assert(_prev_end <= end && end <= offset_end);
int len = end - _prev_end; size_t len = end - _prev_end;
if(len > 0) if(len > 0)
bpe_offsets.push_back(len); bpe_offsets.push_back(len);
_prev_end = end; _prev_end = end;
//if(len) { //if(len) {
// std::string s = ""; // std::string s = "";
// for(int p = end-len; p < end; p++) // for(size_t p = end-len; p < end; p++)
// s += unicode_cpt_to_utf8(cpts[p]); // s += unicode_cpt_to_utf8(cpts[p]);
// printf(">>> '%s'\n", s.c_str()); // printf(">>> '%s'\n", s.c_str());
//} //}
@ -435,7 +435,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
// regex: \p{N}{1,3} // regex: \p{N}{1,3}
if (cpt_type == CODEPOINT_TYPE_DIGIT) { if (cpt_type == CODEPOINT_TYPE_DIGIT) {
int ini = pos; size_t ini = pos;
while(_get_cpt_type(pos) == CODEPOINT_TYPE_DIGIT) { while(_get_cpt_type(pos) == CODEPOINT_TYPE_DIGIT) {
if (++pos - ini >= 3 ) { if (++pos - ini >= 3 ) {
_add_token(pos); _add_token(pos);
@ -460,18 +460,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
continue; continue;
} }
int num_whitespaces = 0; size_t num_whitespaces = 0;
int last_pos_r_or_n = -1; size_t last_end_r_or_n = 0;
while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_WHITESPACE) { while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_WHITESPACE) {
cpt2 = _get_cpt(pos+num_whitespaces); cpt2 = _get_cpt(pos+num_whitespaces);
if (cpt2 == '\r' || cpt2 == '\n') if (cpt2 == '\r' || cpt2 == '\n')
last_pos_r_or_n = pos+num_whitespaces; last_end_r_or_n = pos + num_whitespaces + 1;
num_whitespaces++; num_whitespaces++;
} }
// regex: \s*[\r\n]+ // regex: \s*[\r\n]+
if (last_pos_r_or_n >= 0) { if (last_end_r_or_n > 0) {
pos = last_pos_r_or_n + 1; pos = last_end_r_or_n;
_add_token(pos); _add_token(pos);
continue; continue;
} }