Reimplement unicode_regex_split():

- Using std::basic_regex. - Custom std::ctype specialization for 32-bit codepoints. - Custom std::regex_traits specialization for 32-bit codepoints. - Implementing a custom 'character class expression' for \p{Xx}. - Single-pass regex preparation.
2024-08-13 17:38:46 +02:00 · 2024-08-13 17:38:46 +02:00 · 5a93d2ec50
commit 5a93d2ec50
parent b565148cb4
2 changed files with 279 additions and 335 deletions
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@ -451,66 +451,6 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
    return bpe_offsets;
 }
 // Split wide-character text with std::wregex, handling each input chunk independently.
 //   wtext:      the whole text; chunks are consecutive, chunk k starts where chunk k-1 ended
 //   regex_expr: pattern, compiled once and reused for every chunk
 //   offsets:    length of each chunk of wtext
 // Returns the piece lengths in text order: every match, and every unmatched run before a
 // match or after the last match of a chunk, contributes one entry.
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
    const std::wregex pattern(regex_expr);
    const std::wcregex_iterator no_more_matches;

    std::vector<size_t> pieces;
    pieces.reserve(offsets.size());  // approximate size, grows on demand

    const wchar_t * chunk_begin = wtext.data();
    for (const size_t chunk_len : offsets) {
        int64_t consumed = 0;  // characters of this chunk already covered by emitted pieces
        for (std::wcregex_iterator cur(chunk_begin, chunk_begin + chunk_len, pattern); cur != no_more_matches; ++cur) {
            const int64_t match_pos = cur->position();
            const int64_t match_len = cur->length();
            if (match_pos > consumed) {
                pieces.emplace_back(match_pos - consumed);  // unmatched run before this match
            }
            pieces.emplace_back(match_len);
            consumed = match_pos + match_len;
        }
        if (consumed < (int64_t) chunk_len) {
            pieces.emplace_back(chunk_len - consumed);  // unmatched tail of the chunk
        }
        chunk_begin += chunk_len;
    }
    return pieces;
 }
 // Split byte text with std::regex, handling each input chunk independently.
 //   text:       the whole text; chunks are consecutive, chunk k starts where chunk k-1 ended
 //   regex_expr: pattern, compiled once and reused for every chunk
 //   offsets:    length of each chunk of text
 // Returns the piece lengths in text order: every match, and every unmatched run before a
 // match or after the last match of a chunk, contributes one entry.
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    const std::regex compiled(regex_expr);
    const std::cregex_iterator finished;

    std::vector<size_t> lengths;
    lengths.reserve(offsets.size());  // approximate size, grows on demand

    size_t chunk_start = 0;
    for (const size_t chunk_size : offsets) {
        const char * first = text.data() + chunk_start;
        const char * last  = first + chunk_size;

        int64_t covered = 0;  // bytes of this chunk already covered by emitted lengths
        std::cregex_iterator walker(first, last, compiled);
        while (walker != finished) {
            const std::cmatch & hit = *walker;
            const int64_t gap = hit.position() - covered;
            if (gap > 0) {
                lengths.emplace_back(gap);  // unmatched run before this match
            }
            lengths.emplace_back(hit.length());
            covered = hit.position() + hit.length();
            ++walker;
        }

        if (covered < (int64_t) chunk_size) {
            lengths.emplace_back(chunk_size - covered);  // unmatched tail of the chunk
        }
        chunk_start += chunk_size;
    }
    return lengths;
 }
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;
@ -526,6 +466,261 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
    return bpe_offsets;
 }
 // Custom std::regex specializations for 32bit unicode codepoints
 //   std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ...
 //   std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
 //   std::wregex supports full 32 bit codepoints, not limited to standard max 0x110000.
 namespace std {
    using codepoint = uint32_t;  // codepoint type for all template specializations
    // Minimal required implementation for std::regex string processing.
    // Classification and case conversion only cover the ASCII range; every other
    // codepoint is reported as "not in class" and is returned unchanged by toupper/tolower.
    template<>  // custom specialized std::ctype<codepoint>
    class ctype<codepoint> {
        public:
        using CharT = codepoint;
        using char_type = CharT;
        using mask = uint8_t;          //NOTE: see std::ctype_base
        static const mask digit  = 1;  // required variable names
        static const mask xdigit = 2;  // user defined values
        static const mask alpha  = 3;  // used to be a bitmask
        static const mask upper  = 4;  // we do not need a bitmask
        static const mask lower  = 5;  // using a sequence instead
        static locale::id id;  // required by std::locale::facet
        // Test c against exactly one of the class constants above.
        // The constants are a sequence, not a bitmask, so combined masks always yield false.
        bool is(mask m, char_type c) const {
            switch (m) {
                case digit:  return ('0' <= c && c <= '9');
                // hexadecimal digits are accepted in both cases, like the standard ctype facets do
                case xdigit: return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f');
                case alpha:  return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
                case upper:  return ('A' <= c && c <= 'Z');
                case lower:  return ('a' <= c && c <= 'z');
                default:     return false;
            }
        }
        // ASCII-only upper casing, other codepoints are returned unchanged
        char_type toupper(char_type c) const {
            return ('a' <= c && c <= 'z') ? c - ('a' - 'A') : c;
        }
        // ASCII-only lower casing, other codepoints are returned unchanged
        char_type tolower(char_type c) const {
            return ('A' <= c && c <= 'Z') ? c + ('a' - 'A') : c;
        }
        char_type widen(char c) const {  // char to codepoint
            return (char_type) c;
        }
        // codepoint to char: values below 0x80 map to themselves, anything else yields dfault
        char narrow(char_type c, char dfault) const {
            return (c < 0x80 ? (char)c : dfault);
        }
    };
    locale::id ctype<codepoint>::id = {};
    // Make std::use_facet hand out the custom std::ctype<codepoint>.
    // The locale argument is ignored: every call returns the same function-local instance.
    template<>
    const std::ctype<codepoint> & use_facet<std::ctype<codepoint>>(const std::locale & /*ignored*/) {
        static const std::ctype<codepoint> shared_ctype{};
        return shared_ctype;
    }
    // Same lookup for the const-qualified facet type: forwards to the
    // use_facet<std::ctype<codepoint>> specialization and returns its result.
    template<>
    const std::ctype<codepoint> & use_facet<const std::ctype<codepoint>>(const std::locale & loc) {
        const std::ctype<codepoint> & facet = use_facet<std::ctype<codepoint>>(loc);
        return facet;
    }
    // Minimal required implementation for std::regex string processing.
    // Character classes are expressed as 64-bit masks built with codepoint_categ::expand_bits();
    // bit 63 of a class mask is used here as the "negated class" marker.
    template<>  // custom specialized std::regex_traits<codepoint>
    class regex_traits<codepoint> {
    public:
        using CharT       = codepoint;
        using char_type   = codepoint;
        using size_type   = size_t;
        using string_type = std::basic_string<CharT>;
        using locale_type = std::locale;
        using char_class_type = uint64_t;
        #if (defined(_WIN32) || defined(_WIN64))  // MSVC class _Regex_traits
            using _Uelem = CharT;
            static const auto _Ch_upper = std::ctype<CharT>::upper;
            static const auto _Ch_alpha = std::ctype<CharT>::alpha;
        #endif
        // length of a zero terminated codepoint string
        static size_type length(const CharT * str) {
            return std::char_traits<CharT>::length(str);
        }
        // case sensitive matching compares codepoints as they are
        CharT translate(CharT c) const {
            return c;
        }
        // case insensitive matching compares the unicode_tolower() form of each codepoint
        CharT translate_nocase(CharT c) const {
            return unicode_tolower(c);
        }
        template<typename It>
        string_type transform(It first, It last) const {
            GGML_ASSERT(false);   //TODO: not needed ?
            return {first, last}; //TODO: not tested
        }
        // Only checks that the first element is a valid codepoint; always returns an empty key.
        template<typename It>
        string_type transform_primary(It first, It last) const {
            (void) first;
            (void) last;
            GGML_ASSERT(*first < MAX_CODEPOINTS);  // valid codepoint
            return {};
        }
        // Only the first element is read: it must carry the custom marker (bit 31)
        // and is returned as a one element string.
        template<typename It>
        string_type lookup_collatename(It first, It last) const {
            (void) last;
            GGML_ASSERT(*first & (1u << 31));
            return {*first};
        }
        // Translate a class name into a class mask. Only the first element is read:
        //   's'/'S', 'w'/'W', 'd'/'D' select the WHITESPACES / WORDS / DIGITS flag,
        //   anything else must be a custom codepoint with bit 31 set, carrying the
        //   category in its low 16 bits and the negation in bit 30.
        template<typename It>
        char_class_type lookup_classname(It first, It last, bool icase = false) const {
            (void) last;
            (void) icase;
            const uint32_t encoded = *first;
            codepoint_categ categ = {};
            switch(encoded) {
                case 's':
                case 'S':  // negation is internally tracked
                    categ.set_flag(codepoint_categ::WHITESPACES);
                    return categ.expand_bits();
                case 'w':
                case 'W':  // negation is internally tracked
                    categ.set_flag(codepoint_categ::WORDS);
                    return categ.expand_bits();
                case 'd':
                case 'D':  // negation is internally tracked
                    categ.set_flag(codepoint_categ::DIGITS);
                    return categ.expand_bits();
                default: {  // unicode category \p{Xx} encoded in codepoint
                    GGML_ASSERT(encoded & (1u << 31));  // make sure its our custom codepoint encoding the category
                    const bool negated = encoded & (1u << 30);  // negation of 'character class expression' are not internally tracked
                    categ = {(uint16_t) encoded};
                    return ((uint64_t) negated << 63) | categ.expand_bits(false);
                }
            }
        }
        // c belongs to the class when its expanded category shares a bit with the mask;
        // the answer is inverted when the mask carries the negation marker (bit 63).
        bool isctype(CharT c, char_class_type mask) const {
            const bool negated = mask & (1llu << 63);
            mask &= unicode_cpt_category(c).expand_bits();
            return negated ^ (bool) mask;
        }
        // Digit value of c in the given radix (8, 10 or 16), or -1 when c is not a digit of that radix.
        int value(CharT c, int radix) const {  // char to int value
            switch (radix) {
                case 8:  return ('0' <= c && c <= '7') ? (int)c - '0' : -1;
                case 10: return ('0' <= c && c <= '9') ? (int)c - '0' : -1;
                case 16:
                    if ('0' <= c && c <= '9') { return (int)c - '0'; }
                    if ('A' <= c && c <= 'F') { return (int)c - 'A' + 10; }
                    if ('a' <= c && c <= 'f') { return (int)c - 'a' + 10; }  // lowercase hex digits are valid too
                    return -1;
                default: return -1;
            }
        }
        const locale_type & imbue(const locale_type &) {  // set locale  //NOTE: ignoring locales
            return std::locale::classic();
        }
        const locale_type & getloc() const {  // get locale  //NOTE: ignoring locales
            return std::locale::classic();
        }
    };
 }
 // Convert a UTF-8 regex into the codepoint sequence consumed by std::basic_regex<uint32_t>.
 //
 // Every well-formed \p{X} / \p{Xx} (or \P{X} / \P{Xx}) is replaced by a custom
 // 'character class expression' [:c:], where the single codepoint c carries:
 //   bit 31      marker of the custom encoding
 //   bit 30      set for the negated form \P
 //   low bits    codepoint_categ::encoded of the category
 // Outside of a bracket expression the class is additionally wrapped in [ ].
 // Any other escape sequence is copied verbatim (backslash plus the escaped codepoint), so an
 // escaped bracket does not affect bracket tracking and an escaped backslash can not start a \p.
 // A malformed \p{...} is copied verbatim as well and left for the regex compiler to reject.
 // Everything else is decoded from UTF-8 and copied unchanged.
 static std::vector<uint32_t> unicode_regex_prepare(const std::string & regex) {
    std::vector<uint32_t> regex_cpts;
    regex_cpts.reserve(regex.size() * 12 / 10);  // estimate +20%
    size_t offset = 0;
    int inside_square = 0;
    bool any_positive = false;
    bool any_negative = false;
    const size_t size = regex.size();
    while (offset < size) {
        const char ch = regex[offset];
        inside_square += ch == '[';
        inside_square -= ch == ']';
        GGML_ASSERT(inside_square >= 0);
        if (!inside_square) {
            any_positive = false;
            any_negative = false;
        }
        if (ch == '\\') {
            // NOTE: reading regex[size] of a const std::string yields '\0', and each further
            //       index below is only read after the previous character was found non-zero.
            const size_t i = offset + 1;
            size_t consumed = 0;  // bytes of a well-formed \p{..}, 0 if this is not one
            codepoint_categ categ = {};
            if ((regex[i] == 'p' || regex[i] == 'P') && regex[i + 1] == '{' && regex[i + 2] && regex[i + 3]) {
                // convert \p{Xx} to custom 'character class expression' [:Xy:]
                if (regex[i + 3] == '}') {
                    categ = codepoint_categ::from_chars(regex[i + 2]);
                    consumed = 5;
                } else if (regex[i + 4] == '}') {
                    categ = codepoint_categ::from_chars(regex[i + 2], regex[i + 3]);
                    consumed = 6;
                }
            }
            if (consumed) {
                const bool negated = regex[i] == 'P';
                any_positive |= !negated;
                any_negative |= negated;
                GGML_ASSERT(any_positive != any_negative);  //BUG: can not mix 'p' and 'P' inside []
                GGML_ASSERT(sizeof(categ) <= 2);
                // encoded category in 32 bits codepoint
                const uint32_t cpt_categ = (1u << 31) | ((uint32_t) negated << 30) | categ.encoded;
                if (inside_square) {
                    regex_cpts.insert(regex_cpts.end(), {'[', ':', cpt_categ, ':', ']'});
                } else {
                    regex_cpts.insert(regex_cpts.end(), {'[', '[', ':', cpt_categ, ':', ']', ']'});
                }
                offset += consumed;
                continue;
            }
            // any other escape: copy the backslash together with the escaped codepoint, so the
            // escaped character is never interpreted by this function on the next iteration
            regex_cpts.push_back('\\');
            ++offset;
            if (offset < size) {
                regex_cpts.push_back(unicode_cpt_from_utf8(regex, offset));
            }
            continue;
        }
        regex_cpts.push_back(unicode_cpt_from_utf8(regex, offset));
    }
    return regex_cpts;
 }
 // Split codepoint text with std::basic_regex<uint32_t>, handling each input chunk independently.
 //   text_cpts:  the whole text as codepoints; chunks are consecutive
 //   regex_cpts: the pattern as codepoints, compiled once and reused for every chunk
 //   offsets:    length of each chunk of text_cpts
 // Returns the piece lengths in text order: every match, and every unmatched run before a
 // match or after the last match of a chunk, contributes one entry.
 static std::vector<size_t> unicode_regex_split_stl(const std::vector<uint32_t> & text_cpts, const std::vector<uint32_t> & regex_cpts, const std::vector<size_t> & offsets) {
    using cpt_regex    = std::basic_regex<uint32_t>;
    using cpt_iterator = std::regex_iterator<const uint32_t *>;

    const cpt_regex pattern(regex_cpts.begin(), regex_cpts.end());
    const cpt_iterator no_more_matches;

    std::vector<size_t> pieces;
    pieces.reserve(offsets.size());  // approximate size, grows on demand

    const uint32_t * chunk_begin = text_cpts.data();
    for (const size_t chunk_len : offsets) {
        int64_t consumed = 0;  // codepoints of this chunk already covered by emitted pieces
        for (cpt_iterator cur(chunk_begin, chunk_begin + chunk_len, pattern); cur != no_more_matches; ++cur) {
            const auto & match = *cur;
            if (match.position() > consumed) {
                pieces.emplace_back(match.position() - consumed);  // unmatched run before this match
            }
            pieces.emplace_back(match.length());
            consumed = match.position() + match.length();
        }
        if (consumed < (int64_t) chunk_len) {
            pieces.emplace_back(chunk_len - consumed);  // unmatched tail of the chunk
        }
        chunk_begin += chunk_len;
    }
    return pieces;
 }
 //
 // interface
 //
@ -639,288 +834,21 @@ uint32_t unicode_tolower(uint32_t cp) {
    return it == unicode_map_lowercase.end() ? cp : it->second;
 }
-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
+std::vector<std::string> unicode_regex_split(const std::string & text_utf8, const std::vector<std::string> & regex_exprs) {
-    // std::wregex does not support unicode categories: \p{N}, \p{L}, \p{Lu}, \p{Ll} ...
+    const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text_utf8);
-    // std::wregex does not support unicode whitespaces \s: 0x85, 0xA0, 0x001680 ... 0x003000.
+    std::vector<size_t> offsets = { cpts.size() };
    // std::wregex allows full wchar_t 32 bit codepoints, not limited to standard max 0x110000.
    // The main idea is to insert unicode category bits into all regex and text codepoints.
    //   Max unicode codepoint 0x110000 fits in 21 bits.
    //   Store unicode category and subcategory in 10 bits.
    //   Set the high bit to zero to keep wchar_t positive (uint32_t codepoints).
    //   Categorized codepoint:
    //     1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint
    //     0b0'XXXXXXX'xxx'ccccccccccccccccccccc
    // A "categorized codepoint" re-defines the ordering keeping category hierarchy.
    //   All high category codepoints \p{X} fall into the range:
    //     0b0'XXXXXXX'000'000000000000000000000
    //     0b0'XXXXXXX'111'111111111111111111111
    //   All subcategory codepoints \p{Xx} fall into the range:
    //     0b0'XXXXXXX'xxx'000000000000000000000
    //     0b0'XXXXXXX'xxx'111111111111111111111
    // Processing steps:
    //   Build a lists of "categorized codepoints/ranges" for replacing regex \s \w and \d.
    //   Replace all regex codepoints/ranges with respective "categorized codepoints/ranges".
    //   Replace all text codepoints with respective "categorized codepoints".
    // Caveats:
    //   Some regex ranges starts and ends with different category/subcategory.
    //   Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy.
    //   This forces iterating all ranges and could produce long sub-range sequences.
    //TODO: Regex processing can be cached.
    // insert unicode category and subcategory before codepoint bits
    // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits zero
    static const auto categorized_prefix = [] (const codepoint_categ categ) -> wchar_t {
        static const uint32_t MASK    = codepoint_categ::MASK;  // category mask
        static const uint32_t SUBMASK = codepoint_categ::SUBMASK & ~codepoint_categ::MASK;  // subcategory mask
        return (wchar_t) (((categ.encoded & MASK) << (21+3)) | ((categ.encoded & SUBMASK) << (21-7)));
    };
    // insert unicode category and subcategory before codepoint bits
    // 1 bit zero + 7 bits category + 3 bits subcategory index + 21 bits codepoint
    static const auto categorize_codepoint = [] (const uint32_t cpt) -> wchar_t {
        GGML_ASSERT(cpt < (1 << 21));
        return categorized_prefix(unicode_cpt_category(cpt)) | (wchar_t)cpt;
    };
    // remove the categorized prefix bits and restore original codepoint bits
    static const auto decategorize_codepoint = [] (const wchar_t cpt) -> uint32_t {
        return (uint32_t) cpt & ((1 << 21) - 1);
    };
    // returns the respective categorized codepoint range of the category/subcategory
    static const auto categorize_range_from_chars = [] (const char categ, const char subcateg) {
        const wchar_t range_ini = categorized_prefix(codepoint_categ::from_chars(categ, subcateg));
        const wchar_t range_end = (wchar_t) (range_ini | (subcateg ? (1<<21)-1 : (1<<24)-1));
        return std::pair<wchar_t, wchar_t>(range_ini, range_end);
    };
    // helper function to append/concat regex expressions
    auto wregex_append_subregex = [] (std::wstring & wregex, const std::wstring & subregex, const bool add_squares, const bool negated) {
        if (add_squares) {
            wregex += '[';
            if (negated) {
                wregex += '^';
            }
            wregex += subregex;
            wregex += ']';
        } else {
            GGML_ASSERT(!negated);  //TODO: negation inside square brackets: \S \W \D
            wregex += subregex;
        }
    };
    // \d digits replacement
    static const std::wstring wregex_digits = {
        categorize_codepoint('0'), '-', categorize_codepoint('9'),
    };
    // \w words replacement
    static const std::wstring wregex_words = {
        categorize_codepoint('_'),
        categorize_codepoint('0'), '-', categorize_codepoint('9'),
        categorize_codepoint('A'), '-', categorize_codepoint('Z'),
        categorize_codepoint('a'), '-', categorize_codepoint('z'),
    };
    // \s whitespaces replacement
    static const std::wstring wregex_whitespaces = [] {
        std::wstring wregex_whitespaces;
        for (const auto & range : unicode_ranges_whitespace) {
            wregex_whitespaces += categorize_codepoint(range.first);
            if (range.second > range.first) {
                wregex_whitespaces += '-';
                wregex_whitespaces += categorize_codepoint(range.second);
            }
        }
        return wregex_whitespaces;
    }();
    GGML_ASSERT(sizeof(wchar_t) == sizeof(uint32_t));
    std::wstring wtext = unicode_wstring_from_utf8(text);
    std::vector<size_t> offsets = { wtext.size() };
    for (auto & regex_expr : regex_exprs) {
        // first, see if we have an efficient custom regex implementation
-        auto tmp = unicode_regex_split_custom(text, regex_expr, offsets);
+        auto tmp = unicode_regex_split_custom(text_utf8, regex_expr, offsets);
        if (!tmp.empty()) {
            offsets = std::move(tmp);
            continue;
        }
-        std::wstring wregex;
+        const auto regex_cpts = unicode_regex_prepare(regex_expr);
-        bool inside_square = false;
+        offsets = unicode_regex_split_stl(cpts, regex_cpts, offsets);
        bool is_cpt_range  = false;
        const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
        wregex.reserve(2 * cpts_regex.size());
        for (size_t i = 0; i < cpts_regex.size(); ++i) {
            uint32_t cpt = cpts_regex[i];
            // parse regex metacharacters
            wregex += (wchar_t) cpt;
            if (inside_square) {
                switch(cpt) {
                    case '^':
                        if (cpts_regex[i - 1] != '[') {
                            break;
                        }
                        continue;
                    case ']':
                        inside_square = false;
                        continue;
                    case '-':
                        is_cpt_range = true;
                        continue;
                }
            } else {
                switch(cpt) {
                    case '^':
                        if (i > 0) {
                            break;
                        }
                        continue;
                    case '$':
                        if (i + 1 < cpts_regex.size()) {
                            break;
                        }
                        continue;
                    case '[':
                        inside_square = true;
                        continue;
                    case '{':
                        while (cpt && cpt != '}') {
                            cpt = cpts_regex[++i];
                            wregex += (wchar_t) cpt;
                        }
                        continue;
                    case '}':
                    case ']':
                        GGML_ABORT("invalid regex");
                    case '(':
                        if (cpts_regex[i + 1] == '?') {  // (?: (?i: (?= (?! (?<= (?<!
                            if (cpts_regex[i + 2] == ':') {
                                wregex += (wchar_t) cpts_regex[++i];
                                wregex += (wchar_t) cpts_regex[++i];
                            } else if (cpts_regex[i + 2] == 'i') {
                                wregex += (wchar_t) cpts_regex[++i];
                                wregex += (wchar_t) cpts_regex[++i];
                                wregex += (wchar_t) cpts_regex[++i];
                                GGML_ASSERT(cpts_regex[i] == ':');
                            } else {
                                wregex += (wchar_t) cpts_regex[++i];
                                wregex += (wchar_t) cpts_regex[++i];
                                if (cpts_regex[i] == '<') {
                                    wregex += (wchar_t) cpts_regex[++i];
                                }
                                GGML_ASSERT(cpts_regex[i] == '=' || cpts_regex[i] == '!');
                            }
                        }
                        continue;
                    case ')':
                    case '|':
                    case '.':
                    case '?':
                    case '+':
                    case '*':
                        continue;
                }
            }
            wregex.pop_back();
            // parse unicode categories and subcategories, replace category with the categorized range
            if (cpt == '\\' && cpts_regex[i + 1] == 'p' && cpts_regex[i + 2] == '{') {
                GGML_ASSERT(cpts_regex[i + 3] && cpts_regex[i + 4]);
                std::pair<wchar_t, wchar_t> range;
                if (cpts_regex[i + 4] == '}') {
                    range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)'\0');
                    i += 4;
                } else {
                    range = categorize_range_from_chars((char)cpts_regex[i + 3], (char)cpts_regex[i + 4]);
                    i += 5;
                }
                GGML_ASSERT(cpts_regex[i] == '}');
                const std::wstring subregex = {range.first, '-', range.second};
                wregex_append_subregex(wregex, subregex, !inside_square, false);
                continue;
            }
            // parse more metcharacters and espaped characters
            if (cpt == '\\') {
                switch (cpts_regex[i + 1]) {
                    case 's':  // \s whitespaces
                    case 'S':  // \S no whitespaces
                        wregex_append_subregex(wregex, wregex_whitespaces, !inside_square, cpts_regex[++i] == 'S');
                        continue;
                    case 'w':  // \w words
                    case 'W':  // \W no words
                        wregex_append_subregex(wregex, wregex_words, !inside_square, cpts_regex[++i] == 'W');
                        continue;
                    case 'd':  // \d digits
                    case 'D':  // \D no digits
                        wregex_append_subregex(wregex, wregex_digits, !inside_square, cpts_regex[++i] == 'D');
                        continue;
                    case 't':  ++i;  cpt = '\t';  break;
                    case 'r':  ++i;  cpt = '\r';  break;
                    case 'n':  ++i;  cpt = '\n';  break;
                    case 'x':  GGML_ABORT("TODO");  //TODO: hex values
                    case 'u':  GGML_ABORT("TODO");  //TODO: unicode values
                    case 'U':  GGML_ABORT("TODO");  //TODO: unicode values
                    default:  // escaped character
                        GGML_ASSERT(!is_cpt_range);
                        cpt = cpts_regex[++i];
                        GGML_ASSERT(cpt < 0x80);
                        break;
                }
            }
            if (is_cpt_range) {
                // Some regex ranges starts and ends with different category/subcategory.
                // Split the ranges in sub-ranges to ensure a single category to maintain the new hierarchy.
                // Warning: This forces iterating all ranges and could produce long sub-range sequences.
                GGML_ASSERT(wregex.size() && wregex.back() == '-');
                wregex.pop_back();
                wchar_t categorized = wregex.back();
                uint32_t range_ini = decategorize_codepoint(categorized);
                const uint32_t range_end = cpt;
                GGML_ASSERT(range_ini <= range_end);
                codepoint_categ range_categ = unicode_cpt_category(range_ini);
                for (cpt = range_ini + 1; cpt <= range_end; ++cpt) {
                    codepoint_categ categ = unicode_cpt_category(cpt);
                    if (categ == range_categ) {  // still same range category ?
                        ++categorized;
                        if (cpt == range_ini + 1) {  // single step, no need range
                            wregex += categorized;
                        } else if (cpt == range_ini + 2) {  // need range if +2 step
                            wregex.back() = '-';
                            wregex += categorized;
                        } else {
                            wregex.back() = categorized;  // keep range growing
                        }
                    } else {  // new range category
                        categorized = categorize_codepoint(cpt);
                        wregex += categorized;
                        range_categ = categ;
                        range_ini = cpt;
                    }
                }
                is_cpt_range = false;
            } else {
                wregex += categorize_codepoint(cpt);
            }
        }
        // categorize all wtext codepoints
        if (wtext.size() && wtext[0] < MAX_CODEPOINTS) {  // if not already categorized
            for (size_t i = 0; i < wtext.size(); ++i) {
                wtext[i] = categorize_codepoint((uint32_t) wtext[i]);
            }
        }
        offsets = unicode_regex_split_stl(wtext, wregex, offsets);
    }
    std::vector<std::string> bpe_words;
@ -930,8 +858,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
    for (size_t & offset : offsets) {
        bpe_words.emplace_back();
        for (size_t i = start; i < start + offset; ++i) {
-            const uint32_t cpt = decategorize_codepoint(wtext[i]);
+            bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
            bpe_words.back() += unicode_cpt_to_utf8(cpt);
        }
        start += offset;
    }
--- a/src/unicode.h
+++ b/src/unicode.h
@ -113,6 +113,23 @@ struct codepoint_categ {
    // true when the bits selected by MASK equal the Zp category value
    inline bool is_Zp() const {
        return Zp == (encoded & MASK);
    }
    // true when the bits selected by MASK equal the Zs category value
    inline bool is_Zs() const {
        return Zs == (encoded & MASK);
    }
    // Spread the encoded value over a 64-bit mask, one bit for each category/subcategory and flag:
    //   - the bits above bit 9 of `encoded` (flags) are placed starting at bit 56,
    //   - the MASK field, shifted down by 3, is placed at bit 7 * (encoded & SUBMASK),
    //   - when add_categ is true the same field is also placed at bit 0.
    inline uint64_t expand_bits(const bool add_categ=true) const {
        const uint64_t categ_bits = (encoded & MASK) >> 3;
        const uint64_t flag_bits  = encoded >> 10;
        const uint32_t sub_index  = encoded & SUBMASK;

        uint64_t expanded = flag_bits << (7 * 8);
        expanded |= categ_bits << (7 * sub_index);
        if (add_categ) {
            expanded |= categ_bits;
        }
        return expanded;
    }
    // Range test: this.first <= other <= this.last.
    //   - any SUBMASK bit set in this value: no range, other must be identical,
    //   - otherwise, any MASK bit set: other is compared with its SUBMASK bits cleared,
    //   - otherwise: other is compared with its MASK bits cleared.
    inline bool is_in_range(const codepoint_categ other) const {
        const bool exact_only = (encoded & SUBMASK) != 0;
        if (exact_only) {
            return other.encoded == encoded;  // no range
        }
        const bool no_categ = (encoded & MASK) == 0;
        if (no_categ) {
            return encoded == (other.encoded & ~MASK);  // from 0bffffff'0000000'000 to 0bffffff'1111111'111
        }
        return encoded == (other.encoded & ~SUBMASK);  // from 0bffffff'ccccccc'000 to 0bffffff'ccccccc'111
    }
    // two values are equal when their whole encoded representation is identical
    inline bool operator == (const codepoint_categ other) const {
        const bool same_encoding = (other.encoded == encoded);
        return same_encoding;
    }