diff --git a/unicode.cpp b/unicode.cpp index 1c3c2b2c6..233dae564 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -446,13 +446,12 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & } // regex: ?[^\s\p{L}\p{N}]+[\r\n]* - char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt); int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type); if (cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_DIGIT && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { pos += (cpt == ' '); while(cpt2_type != CODEPOINT_TYPE_WHITESPACE && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_DIGIT && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) cpt2_type = _get_cpt_type(++pos); - cpt2 = _get_cpt(pos); + char32_t cpt2 = _get_cpt(pos); while(cpt2 == '\r' || cpt2 == '\n') cpt2 = _get_cpt(++pos); _add_token(pos); @@ -462,7 +461,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & size_t num_whitespaces = 0; size_t last_end_r_or_n = 0; while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_WHITESPACE) { - cpt2 = _get_cpt(pos+num_whitespaces); + char32_t cpt2 = _get_cpt(pos+num_whitespaces); if (cpt2 == '\r' || cpt2 == '\n') last_end_r_or_n = pos + num_whitespaces + 1; num_whitespaces++;