From b565148cb43b732327a4c515b6b484d55dc53f9b Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 13 Aug 2024 16:42:33 +0200 Subject: [PATCH] Update codepoint_categ: - Reorganize category/subcategory bits. - Regex flags for \s \w \d. --- src/unicode.cpp | 22 +++--- src/unicode.h | 176 ++++++++++++++++++++++++------------------------ 2 files changed, 96 insertions(+), 102 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 4a5728ed6..20c1287c4 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -209,7 +209,7 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; + static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1; auto _get_categ = [&] (const size_t pos) -> codepoint_categ { return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL; }; @@ -328,7 +328,7 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; + static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1; auto _get_categ = [&] (const size_t pos) -> codepoint_categ { return (offset_ini <= pos && pos < offset_end) ? 
unicode_cpt_category(cpts[pos]) : SENTINEL; }; @@ -589,28 +589,24 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { for (uint16_t rle : unicode_rle_codepoints_categs) { const uint32_t index = rle & 31; const uint32_t count = rle >> 5; - const auto categ = codepoint_categ::from_index(index); - //printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str()); + auto categ = codepoint_categ::from_index(index); + //printf("Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str()); + categ.set_flag(codepoint_categ::DIGITS, categ.is_Nd()); // \d --> \p{Nd} + categ.set_flag(codepoint_categ::WORDS, categ.is_L() | categ.is_N()); // \w --> \p{L} \p{N} _ for (uint32_t i = 0; i <= count; ++i) { cpt_categs[cpt++] = categ; } } GGML_ASSERT(cpt == MAX_CODEPOINTS); + cpt_categs['_'].set_flag(codepoint_categ::WORDS); // \w --> \p{L} \p{N} _ + for (auto p : unicode_ranges_whitespace) { for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) { - cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); + cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACES); } } - for (auto p : unicode_map_lowercase) { - cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE); - } - - for (auto p : unicode_map_uppercase) { - cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE); - } - //for (auto &range : unicode_ranges_nfd) { // start, last, nfd // cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD); //} diff --git a/src/unicode.h b/src/unicode.h index 8a3f4078c..3aeb74771 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -9,74 +9,71 @@ #include struct codepoint_categ { + // 0bffffff'ccccccc'sss --> 6 bits flags + 7 bits category + 3 bits subcategory enum _category : uint16_t { - UNDEF = 0, // \p{Cn} Undefined - C = 1 << 0, // \p{C} Control - L = 1 << 1, // \p{L} Letter - M = 1 << 2, // \p{M} Mark - N = 1 << 3, // \p{N} Number - P = 1 << 4, // \p{P} Punctuation - S = 1 << 5, // \p{S} Symbol - Z = 1 << 6, // \p{Z} Separator - MASK = (1 << 7) - 1 // 7 
bits - }; - - enum _subcategory : uint16_t { - Cc = C | (1 << 7), // \p{Cc} Control - Cf = C | (2 << 7), // \p{Cf} Format - Co = C | (3 << 7), // \p{Co} Private Use - Cs = C | (4 << 7), // \p{Cs} Surrrogate - Ll = L | (1 << 7), // \p{Ll} Lowercase Letter - Lm = L | (2 << 7), // \p{Lm} Modifier Letter - Lo = L | (3 << 7), // \p{Lo} Other Letter - Lt = L | (4 << 7), // \p{Lt} Titlecase Letter - Lu = L | (5 << 7), // \p{Lu} Uppercase Letter - Mc = M | (1 << 7), // \p{Mc} Spacing Mark - Me = M | (2 << 7), // \p{Me} Enclosing Mark - Mn = M | (3 << 7), // \p{Mn} Nonspacing Mark - Nd = N | (1 << 7), // \p{Nd} Decimal Number - Nl = N | (2 << 7), // \p{Nl} Letter Number - No = N | (3 << 7), // \p{No} Other Number - Pc = P | (1 << 7), // \p{Pc} Connector Punctuation - Pd = P | (2 << 7), // \p{Pd} Dash Punctuation - Pe = P | (3 << 7), // \p{Pe} Close Punctuation - Pf = P | (4 << 7), // \p{Pf} Final Punctuation - Pi = P | (5 << 7), // \p{Pi} Initial Punctuation - Po = P | (6 << 7), // \p{Po} Other Punctuation - Ps = P | (7 << 7), // \p{Ps} Open Punctuation - Sc = S | (1 << 7), // \p{Sc} Currency Symbol - Sk = S | (2 << 7), // \p{Sk} Modifier Symbol - Sm = S | (3 << 7), // \p{Sm} Math Symbol - So = S | (4 << 7), // \p{So} Other Symbol - Zl = Z | (1 << 7), // \p{Zl} Line Separator - Zp = Z | (2 << 7), // \p{Zp} Paragraph Separator - Zs = Z | (3 << 7), // \p{Zs} Space Separator - SUBMASK = (1 << 10) - 1 // 7+3 bits + UNDEF = 0, // \p{Cn} Undefined + C = 1 << (0 + 3), // \p{C} Control + L = 1 << (1 + 3), // \p{L} Letter + M = 1 << (2 + 3), // \p{M} Mark + N = 1 << (3 + 3), // \p{N} Number + P = 1 << (4 + 3), // \p{P} Punctuation + S = 1 << (5 + 3), // \p{S} Symbol + Z = 1 << (6 + 3), // \p{Z} Separator + Cc = C | 1, // \p{Cc} Control + Cf = C | 2, // \p{Cf} Format + Co = C | 3, // \p{Co} Private Use + Cs = C | 4, // \p{Cs} Surrogate + Ll = L | 1, // \p{Ll} Lowercase Letter + Lm = L | 2, // \p{Lm} Modifier Letter + Lo = L | 3, // \p{Lo} Other Letter + Lt = L | 4, // \p{Lt} 
Titlecase Letter + Lu = L | 5, // \p{Lu} Uppercase Letter + Mc = M | 1, // \p{Mc} Spacing Mark + Me = M | 2, // \p{Me} Enclosing Mark + Mn = M | 3, // \p{Mn} Nonspacing Mark + Nd = N | 1, // \p{Nd} Decimal Number + Nl = N | 2, // \p{Nl} Letter Number + No = N | 3, // \p{No} Other Number + Pc = P | 1, // \p{Pc} Connector Punctuation + Pd = P | 2, // \p{Pd} Dash Punctuation + Pe = P | 3, // \p{Pe} Close Punctuation + Pf = P | 4, // \p{Pf} Final Punctuation + Pi = P | 5, // \p{Pi} Initial Punctuation + Po = P | 6, // \p{Po} Other Punctuation + Ps = P | 7, // \p{Ps} Open Punctuation + Sc = S | 1, // \p{Sc} Currency Symbol + Sk = S | 2, // \p{Sk} Modifier Symbol + Sm = S | 3, // \p{Sm} Math Symbol + So = S | 4, // \p{So} Other Symbol + Zl = Z | 1, // \p{Zl} Line Separator + Zp = Z | 2, // \p{Zp} Paragraph Separator + Zs = Z | 3, // \p{Zs} Space Separator + SUBMASK = (1 << 3) - 1, // 3 bits 0b000000'0000000'111 + MASK = (1 << 10) - 1, // 7+3 bits 0b000000'1111111'111 }; enum _flags : uint16_t { - WHITESPACE = (1 << 10), // regex: \s - LOWERCASE = (1 << 11), - UPPERCASE = (1 << 12), + WHITESPACES = (1 << 10), // regex: \s + WORDS = (1 << 11), // regex: \w + DIGITS = (1 << 12), // regex: \d //Norm NFD/NFC = ..., }; inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {} inline void set_flag(_flags flags, bool value = true) { - flags = (_flags) (flags & ~SUBMASK); // ignore category bits + flags = (_flags) (flags & ~MASK); // do not modify category bits encoded = value ? 
(encoded | flags) : (encoded & ~flags); } inline uint16_t get_category() const { return encoded & MASK; } - inline uint16_t get_subcategory() const { return encoded & SUBMASK; } inline bool is_undefined() const { return !encoded; } inline bool is_defined() const { return encoded; } - inline uint16_t is_whitespace() const { return encoded & WHITESPACE; } - inline uint16_t is_lowercase() const { return encoded & LOWERCASE; } - inline uint16_t is_uppercase() const { return encoded & UPPERCASE; } + inline uint16_t is_whitespace() const { return encoded & WHITESPACES; } + inline uint16_t is_word() const { return encoded & WORDS; } + inline uint16_t is_digit() const { return encoded & DIGITS; } inline uint16_t is_C() const { return encoded & C; } inline uint16_t is_L() const { return encoded & L; } @@ -86,35 +83,35 @@ struct codepoint_categ { inline uint16_t is_S() const { return encoded & S; } inline uint16_t is_Z() const { return encoded & Z; } - inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; } - inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; } - inline bool is_Co() const { return (encoded & SUBMASK) == Co; } - inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; } - inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; } - inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; } - inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; } - inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; } - inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; } - inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; } - inline bool is_Me() const { return (encoded & SUBMASK) == Me; } - inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; } - inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; } - inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; } - inline bool is_No() const { return (encoded & SUBMASK) == No; } - inline bool is_Pc() const { return (encoded & 
SUBMASK) == Pc; } - inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; } - inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; } - inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; } - inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; } - inline bool is_Po() const { return (encoded & SUBMASK) == Po; } - inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; } - inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; } - inline bool is_Sk() const { return (encoded & SUBMASK) == Sk; } - inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; } - inline bool is_So() const { return (encoded & SUBMASK) == So; } - inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; } - inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; } - inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; } + inline bool is_Cc() const { return (encoded & MASK) == Cc; } + inline bool is_Cf() const { return (encoded & MASK) == Cf; } + inline bool is_Co() const { return (encoded & MASK) == Co; } + inline bool is_Cs() const { return (encoded & MASK) == Cs; } + inline bool is_Ll() const { return (encoded & MASK) == Ll; } + inline bool is_Lm() const { return (encoded & MASK) == Lm; } + inline bool is_Lo() const { return (encoded & MASK) == Lo; } + inline bool is_Lt() const { return (encoded & MASK) == Lt; } + inline bool is_Lu() const { return (encoded & MASK) == Lu; } + inline bool is_Mc() const { return (encoded & MASK) == Mc; } + inline bool is_Me() const { return (encoded & MASK) == Me; } + inline bool is_Mn() const { return (encoded & MASK) == Mn; } + inline bool is_Nd() const { return (encoded & MASK) == Nd; } + inline bool is_Nl() const { return (encoded & MASK) == Nl; } + inline bool is_No() const { return (encoded & MASK) == No; } + inline bool is_Pc() const { return (encoded & MASK) == Pc; } + inline bool is_Pd() const { return (encoded & MASK) == Pd; } + inline bool is_Pe() const { return (encoded & 
MASK) == Pe; } + inline bool is_Pf() const { return (encoded & MASK) == Pf; } + inline bool is_Pi() const { return (encoded & MASK) == Pi; } + inline bool is_Po() const { return (encoded & MASK) == Po; } + inline bool is_Ps() const { return (encoded & MASK) == Ps; } + inline bool is_Sc() const { return (encoded & MASK) == Sc; } + inline bool is_Sk() const { return (encoded & MASK) == Sk; } + inline bool is_Sm() const { return (encoded & MASK) == Sm; } + inline bool is_So() const { return (encoded & MASK) == So; } + inline bool is_Zl() const { return (encoded & MASK) == Zl; } + inline bool is_Zp() const { return (encoded & MASK) == Zp; } + inline bool is_Zs() const { return (encoded & MASK) == Zs; } inline bool operator == (const codepoint_categ other) const { return encoded == other.encoded; @@ -132,7 +129,7 @@ struct codepoint_categ { {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"}, {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"}, }; - const auto it = map.find(encoded & SUBMASK); + const auto it = map.find(encoded & MASK); return it == map.end() ? "INVALID" : it->second; } @@ -149,18 +146,19 @@ struct codepoint_categ { return 0; } const char * p = strchr(subcategs, subcateg); - return (uint16_t) (p ? 
(p - subcategs + 1) : 0); + GGML_ASSERT(p); + return (uint16_t) (p - subcategs + 1); }; switch(categ) { case 'C': if(subcateg == 'n') return 0; // undefined - return C | (_subindex(subcateg, "cfos" ) << 7); - case 'L': return L | (_subindex(subcateg, "lmotu" ) << 7); - case 'M': return M | (_subindex(subcateg, "cen" ) << 7); - case 'N': return N | (_subindex(subcateg, "dlo" ) << 7); - case 'P': return P | (_subindex(subcateg, "cdefios") << 7); - case 'S': return S | (_subindex(subcateg, "ckmo" ) << 7); - case 'Z': return Z | (_subindex(subcateg, "lps" ) << 7); - default: assert (false); return 0; + return C | _subindex(subcateg, "cfos" ); + case 'L': return L | _subindex(subcateg, "lmotu" ); + case 'M': return M | _subindex(subcateg, "cen" ); + case 'N': return N | _subindex(subcateg, "dlo" ); + case 'P': return P | _subindex(subcateg, "cdefios"); + case 'S': return S | _subindex(subcateg, "ckmo" ); + case 'Z': return Z | _subindex(subcateg, "lps" ); + default: GGML_ABORT("invalid category character"); } }