From fe3c53191525e6244f6a6364afd137372a636665 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 25 May 2024 02:10:08 +0200
Subject: [PATCH] bugfix: custom regex split fails with codepoint 0

---
 unicode.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/unicode.cpp b/unicode.cpp
index 056a4c741..2d638613f 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -226,8 +226,9 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t
         assert(offset_end <= cpts.size());
         start = offset_end;
 
+        static const char32_t OUT_OF_RANGE = 0xFFFFFFFF;
         auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
         auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -309,7 +310,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t
             }
 
             // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                 pos += num_whitespaces - 1;
                 _add_token(pos);
                 continue;
@@ -344,8 +345,9 @@ static std::vector unicode_regex_split_custom_llama3(const std::string &
         assert(offset_end <= cpts.size());
         start = offset_end;
 
+        static const char32_t OUT_OF_RANGE = 0xFFFFFFFF;
         auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
         auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -450,7 +452,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string &
             }
 
             // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
                 pos += num_whitespaces - 1;
                 _add_token(pos);
                 continue;