From b565148cb43b732327a4c515b6b484d55dc53f9b Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Tue, 13 Aug 2024 16:42:33 +0200
Subject: [PATCH] Update codepoint_categ:

- Reorganize category/subcategory bits.
- Regex flags for \s \w \d.
---
 src/unicode.cpp |  22 +++---
 src/unicode.h   | 176 ++++++++++++++++++++++++------------------------
 2 files changed, 96 insertions(+), 102 deletions(-)
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 4a5728ed6..20c1287c4 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -209,7 +209,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
+        static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
         auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
             return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
         };
@@ -328,7 +328,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
         };
 
-        static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
+        static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
         auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
             return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
         };
@@ -589,28 +589,24 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
         for (uint16_t rle : unicode_rle_codepoints_categs) {
             const uint32_t index = rle & 31;
             const uint32_t count = rle >> 5;
-            const auto categ = codepoint_categ::from_index(index);
-            //printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
+            auto categ = codepoint_categ::from_index(index);
+            //printf("Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
+            categ.set_flag(codepoint_categ::DIGITS, categ.is_Nd());               // \d --> \p{Nd}
+            categ.set_flag(codepoint_categ::WORDS, categ.is_L() | categ.is_N());  // \w --> \p{L} \p{N} _
             for (uint32_t i = 0; i <= count; ++i) {
                 cpt_categs[cpt++] = categ;
             }
         }
         GGML_ASSERT(cpt == MAX_CODEPOINTS);
 
+        cpt_categs['_'].set_flag(codepoint_categ::WORDS);  // \w --> \p{L} \p{N} _
+
         for (auto p : unicode_ranges_whitespace) {
             for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
-                cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+                cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACES);
             }
         }
 
-        for (auto p : unicode_map_lowercase) {
-            cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE);
-        }
-
-        for (auto p : unicode_map_uppercase) {
-            cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE);
-        }
-
         //for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
         //    cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD);
         //}
diff --git a/src/unicode.h b/src/unicode.h
index 8a3f4078c..3aeb74771 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -9,74 +9,71 @@
 #include <map>
 
 struct codepoint_categ {
+    // 0bffffff'ccccccc'sss --> 6 bits flags + 7 bits category + 3 bits subcategory
     enum _category : uint16_t {
-        UNDEF = 0,   // \p{Cn} Undefined
-        C = 1 << 0,  // \p{C}  Control
-        L = 1 << 1,  // \p{L}  Letter
-        M = 1 << 2,  // \p{M}  Mark
-        N = 1 << 3,  // \p{N}  Number
-        P = 1 << 4,  // \p{P}  Punctuation
-        S = 1 << 5,  // \p{S}  Symbol
-        Z = 1 << 6,  // \p{Z}  Separator
-        MASK = (1 << 7) - 1  // 7 bits
-    };
-
-    enum _subcategory : uint16_t {
-        Cc = C | (1 << 7),  // \p{Cc} Control
-        Cf = C | (2 << 7),  // \p{Cf} Format
-        Co = C | (3 << 7),  // \p{Co} Private Use
-        Cs = C | (4 << 7),  // \p{Cs} Surrrogate
-        Ll = L | (1 << 7),  // \p{Ll} Lowercase Letter
-        Lm = L | (2 << 7),  // \p{Lm} Modifier Letter
-        Lo = L | (3 << 7),  // \p{Lo} Other Letter
-        Lt = L | (4 << 7),  // \p{Lt} Titlecase Letter
-        Lu = L | (5 << 7),  // \p{Lu} Uppercase Letter
-        Mc = M | (1 << 7),  // \p{Mc} Spacing Mark
-        Me = M | (2 << 7),  // \p{Me} Enclosing Mark
-        Mn = M | (3 << 7),  // \p{Mn} Nonspacing Mark
-        Nd = N | (1 << 7),  // \p{Nd} Decimal Number
-        Nl = N | (2 << 7),  // \p{Nl} Letter Number
-        No = N | (3 << 7),  // \p{No} Other Number
-        Pc = P | (1 << 7),  // \p{Pc} Connector Punctuation
-        Pd = P | (2 << 7),  // \p{Pd} Dash Punctuation
-        Pe = P | (3 << 7),  // \p{Pe} Close Punctuation
-        Pf = P | (4 << 7),  // \p{Pf} Final Punctuation
-        Pi = P | (5 << 7),  // \p{Pi} Initial Punctuation
-        Po = P | (6 << 7),  // \p{Po} Other Punctuation
-        Ps = P | (7 << 7),  // \p{Ps} Open Punctuation
-        Sc = S | (1 << 7),  // \p{Sc} Currency Symbol
-        Sk = S | (2 << 7),  // \p{Sk} Modifier Symbol
-        Sm = S | (3 << 7),  // \p{Sm} Math Symbol
-        So = S | (4 << 7),  // \p{So} Other Symbol
-        Zl = Z | (1 << 7),  // \p{Zl} Line Separator
-        Zp = Z | (2 << 7),  // \p{Zp} Paragraph Separator
-        Zs = Z | (3 << 7),  // \p{Zs} Space Separator
-        SUBMASK = (1 << 10) - 1  // 7+3 bits
+        UNDEF = 0,         // \p{Cn} Undefined
+        C = 1 << (0 + 3),  // \p{C}  Control
+        L = 1 << (1 + 3),  // \p{L}  Letter
+        M = 1 << (2 + 3),  // \p{M}  Mark
+        N = 1 << (3 + 3),  // \p{N}  Number
+        P = 1 << (4 + 3),  // \p{P}  Punctuation
+        S = 1 << (5 + 3),  // \p{S}  Symbol
+        Z = 1 << (6 + 3),  // \p{Z}  Separator
+        Cc = C | 1,  // \p{Cc} Control
+        Cf = C | 2,  // \p{Cf} Format
+        Co = C | 3,  // \p{Co} Private Use
+        Cs = C | 4,  // \p{Cs} Surrogate
+        Ll = L | 1,  // \p{Ll} Lowercase Letter
+        Lm = L | 2,  // \p{Lm} Modifier Letter
+        Lo = L | 3,  // \p{Lo} Other Letter
+        Lt = L | 4,  // \p{Lt} Titlecase Letter
+        Lu = L | 5,  // \p{Lu} Uppercase Letter
+        Mc = M | 1,  // \p{Mc} Spacing Mark
+        Me = M | 2,  // \p{Me} Enclosing Mark
+        Mn = M | 3,  // \p{Mn} Nonspacing Mark
+        Nd = N | 1,  // \p{Nd} Decimal Number
+        Nl = N | 2,  // \p{Nl} Letter Number
+        No = N | 3,  // \p{No} Other Number
+        Pc = P | 1,  // \p{Pc} Connector Punctuation
+        Pd = P | 2,  // \p{Pd} Dash Punctuation
+        Pe = P | 3,  // \p{Pe} Close Punctuation
+        Pf = P | 4,  // \p{Pf} Final Punctuation
+        Pi = P | 5,  // \p{Pi} Initial Punctuation
+        Po = P | 6,  // \p{Po} Other Punctuation
+        Ps = P | 7,  // \p{Ps} Open Punctuation
+        Sc = S | 1,  // \p{Sc} Currency Symbol
+        Sk = S | 2,  // \p{Sk} Modifier Symbol
+        Sm = S | 3,  // \p{Sm} Math Symbol
+        So = S | 4,  // \p{So} Other Symbol
+        Zl = Z | 1,  // \p{Zl} Line Separator
+        Zp = Z | 2,  // \p{Zp} Paragraph Separator
+        Zs = Z | 3,  // \p{Zs} Space Separator
+        SUBMASK = (1 <<  3) - 1,  // 3 bits   0b000000'0000000'111
+        MASK    = (1 << 10) - 1,  // 7+3 bits 0b000000'1111111'111
     };
 
     enum _flags : uint16_t {
-        WHITESPACE = (1 << 10),  // regex: \s
-        LOWERCASE  = (1 << 11),
-        UPPERCASE  = (1 << 12),
+        WHITESPACES = (1 << 10),  // regex: \s
+        WORDS       = (1 << 11),  // regex: \w
+        DIGITS      = (1 << 12),  // regex: \d
         //Norm NFD/NFC  = ...,
     };
 
     inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
 
     inline void set_flag(_flags flags, bool value = true) {
-        flags = (_flags) (flags & ~SUBMASK);  // ignore category bits
+        flags = (_flags) (flags & ~MASK);  // do not modify category bits
         encoded = value ? (encoded | flags) : (encoded & ~flags);
     }
 
     inline uint16_t get_category() const { return encoded & MASK; }
-    inline uint16_t get_subcategory() const { return encoded & SUBMASK; }
 
     inline bool is_undefined() const { return !encoded; }
     inline bool is_defined() const { return encoded; }
 
-    inline uint16_t is_whitespace() const { return encoded & WHITESPACE; }
-    inline uint16_t is_lowercase()  const { return encoded & LOWERCASE; }
-    inline uint16_t is_uppercase()  const { return encoded & UPPERCASE; }
+    inline uint16_t is_whitespace() const { return encoded & WHITESPACES; }
+    inline uint16_t is_word()       const { return encoded & WORDS;  }
+    inline uint16_t is_digit()      const { return encoded & DIGITS; }
 
     inline uint16_t is_C() const { return encoded & C; }
     inline uint16_t is_L() const { return encoded & L; }
@@ -86,35 +83,35 @@ struct codepoint_categ {
     inline uint16_t is_S() const { return encoded & S; }
     inline uint16_t is_Z() const { return encoded & Z; }
 
-    inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; }
-    inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; }
-    inline bool is_Co() const { return (encoded & SUBMASK) == Co; }
-    inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; }
-    inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; }
-    inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; }
-    inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; }
-    inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; }
-    inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; }
-    inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; }
-    inline bool is_Me() const { return (encoded & SUBMASK) == Me; }
-    inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; }
-    inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; }
-    inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; }
-    inline bool is_No() const { return (encoded & SUBMASK) == No; }
-    inline bool is_Pc() const { return (encoded & SUBMASK) == Pc; }
-    inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; }
-    inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; }
-    inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; }
-    inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; }
-    inline bool is_Po() const { return (encoded & SUBMASK) == Po; }
-    inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; }
-    inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; }
-    inline bool is_Sk() const { return (encoded & SUBMASK) == Sk; }
-    inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; }
-    inline bool is_So() const { return (encoded & SUBMASK) == So; }
-    inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; }
-    inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; }
-    inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; }
+    inline bool is_Cc() const { return (encoded & MASK) == Cc; }
+    inline bool is_Cf() const { return (encoded & MASK) == Cf; }
+    inline bool is_Co() const { return (encoded & MASK) == Co; }
+    inline bool is_Cs() const { return (encoded & MASK) == Cs; }
+    inline bool is_Ll() const { return (encoded & MASK) == Ll; }
+    inline bool is_Lm() const { return (encoded & MASK) == Lm; }
+    inline bool is_Lo() const { return (encoded & MASK) == Lo; }
+    inline bool is_Lt() const { return (encoded & MASK) == Lt; }
+    inline bool is_Lu() const { return (encoded & MASK) == Lu; }
+    inline bool is_Mc() const { return (encoded & MASK) == Mc; }
+    inline bool is_Me() const { return (encoded & MASK) == Me; }
+    inline bool is_Mn() const { return (encoded & MASK) == Mn; }
+    inline bool is_Nd() const { return (encoded & MASK) == Nd; }
+    inline bool is_Nl() const { return (encoded & MASK) == Nl; }
+    inline bool is_No() const { return (encoded & MASK) == No; }
+    inline bool is_Pc() const { return (encoded & MASK) == Pc; }
+    inline bool is_Pd() const { return (encoded & MASK) == Pd; }
+    inline bool is_Pe() const { return (encoded & MASK) == Pe; }
+    inline bool is_Pf() const { return (encoded & MASK) == Pf; }
+    inline bool is_Pi() const { return (encoded & MASK) == Pi; }
+    inline bool is_Po() const { return (encoded & MASK) == Po; }
+    inline bool is_Ps() const { return (encoded & MASK) == Ps; }
+    inline bool is_Sc() const { return (encoded & MASK) == Sc; }
+    inline bool is_Sk() const { return (encoded & MASK) == Sk; }
+    inline bool is_Sm() const { return (encoded & MASK) == Sm; }
+    inline bool is_So() const { return (encoded & MASK) == So; }
+    inline bool is_Zl() const { return (encoded & MASK) == Zl; }
+    inline bool is_Zp() const { return (encoded & MASK) == Zp; }
+    inline bool is_Zs() const { return (encoded & MASK) == Zs; }
 
     inline bool operator == (const codepoint_categ other) const {
         return encoded == other.encoded;
@@ -132,7 +129,7 @@ struct codepoint_categ {
             {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"},
             {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"},
         };
-        const auto it = map.find(encoded & SUBMASK);
+        const auto it = map.find(encoded & MASK);
         return it == map.end() ? "INVALID" : it->second;
     }
 
@@ -149,18 +146,19 @@ struct codepoint_categ {
                 return 0;
             }
             const char * p = strchr(subcategs, subcateg);
-            return (uint16_t) (p ? (p - subcategs + 1) : 0);
+            GGML_ASSERT(p);
+            return (uint16_t) (p - subcategs + 1);
         };
         switch(categ) {
             case 'C':  if(subcateg == 'n') return 0;  // undefined
-                       return C | (_subindex(subcateg, "cfos"   ) << 7);
-            case 'L':  return L | (_subindex(subcateg, "lmotu"  ) << 7);
-            case 'M':  return M | (_subindex(subcateg, "cen"    ) << 7);
-            case 'N':  return N | (_subindex(subcateg, "dlo"    ) << 7);
-            case 'P':  return P | (_subindex(subcateg, "cdefios") << 7);
-            case 'S':  return S | (_subindex(subcateg, "ckmo"   ) << 7);
-            case 'Z':  return Z | (_subindex(subcateg, "lps"    ) << 7);
-            default:   assert (false);  return 0;
+                       return C | _subindex(subcateg, "cfos"   );
+            case 'L':  return L | _subindex(subcateg, "lmotu"  );
+            case 'M':  return M | _subindex(subcateg, "cen"    );
+            case 'N':  return N | _subindex(subcateg, "dlo"    );
+            case 'P':  return P | _subindex(subcateg, "cdefios");
+            case 'S':  return S | _subindex(subcateg, "ckmo"   );
+            case 'Z':  return Z | _subindex(subcateg, "lps"    );
+            default:   GGML_ABORT("invalid category character");
         }
     }