Update codepoint_categ:

- Reorganize category/subcategory bits.
- Add regex flags for \s, \w and \d (replacing the lowercase/uppercase flags).
This commit is contained in:
jaime-m-p 2024-08-13 16:42:33 +02:00
parent 312c4322cc
commit b565148cb4
2 changed files with 96 additions and 102 deletions

View file

@ -209,7 +209,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
}; };
static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
auto _get_categ = [&] (const size_t pos) -> codepoint_categ { auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL; return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
}; };
@ -328,7 +328,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
}; };
static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1; static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
auto _get_categ = [&] (const size_t pos) -> codepoint_categ { auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL; return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
}; };
@ -589,28 +589,24 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
for (uint16_t rle : unicode_rle_codepoints_categs) { for (uint16_t rle : unicode_rle_codepoints_categs) {
const uint32_t index = rle & 31; const uint32_t index = rle & 31;
const uint32_t count = rle >> 5; const uint32_t count = rle >> 5;
const auto categ = codepoint_categ::from_index(index); auto categ = codepoint_categ::from_index(index);
//printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str()); //printf("Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
categ.set_flag(codepoint_categ::DIGITS, categ.is_Nd()); // \d --> \p{Nd}
categ.set_flag(codepoint_categ::WORDS, categ.is_L() | categ.is_N()); // \w --> \p{L} \p{N} _
for (uint32_t i = 0; i <= count; ++i) { for (uint32_t i = 0; i <= count; ++i) {
cpt_categs[cpt++] = categ; cpt_categs[cpt++] = categ;
} }
} }
GGML_ASSERT(cpt == MAX_CODEPOINTS); GGML_ASSERT(cpt == MAX_CODEPOINTS);
cpt_categs['_'].set_flag(codepoint_categ::WORDS); // \w --> \p{L} \p{N} _
for (auto p : unicode_ranges_whitespace) { for (auto p : unicode_ranges_whitespace) {
for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) { for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACES);
} }
} }
for (auto p : unicode_map_lowercase) {
cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE);
}
for (auto p : unicode_map_uppercase) {
cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE);
}
//for (auto &range : unicode_ranges_nfd) { // start, last, nfd //for (auto &range : unicode_ranges_nfd) { // start, last, nfd
// cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD); // cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD);
//} //}

View file

@ -9,74 +9,71 @@
#include <map> #include <map>
struct codepoint_categ { struct codepoint_categ {
// 0bffffff'ccccccc'sss --> 6 bits flags + 7 bits category + 3 bits subcategory
enum _category : uint16_t { enum _category : uint16_t {
UNDEF = 0, // \p{Cn} Undefined UNDEF = 0, // \p{Cn} Undefined
C = 1 << 0, // \p{C} Control C = 1 << (0 + 3), // \p{C} Control
L = 1 << 1, // \p{L} Letter L = 1 << (1 + 3), // \p{L} Letter
M = 1 << 2, // \p{M} Mark M = 1 << (2 + 3), // \p{M} Mark
N = 1 << 3, // \p{N} Number N = 1 << (3 + 3), // \p{N} Number
P = 1 << 4, // \p{P} Punctuation P = 1 << (4 + 3), // \p{P} Punctuation
S = 1 << 5, // \p{S} Symbol S = 1 << (5 + 3), // \p{S} Symbol
Z = 1 << 6, // \p{Z} Separator Z = 1 << (6 + 3), // \p{Z} Separator
MASK = (1 << 7) - 1 // 7 bits Cc = C | 1, // \p{Cc} Control
}; Cf = C | 2, // \p{Cf} Format
Co = C | 3, // \p{Co} Private Use
enum _subcategory : uint16_t { Cs = C | 4, // \p{Cs} Surrogate
Cc = C | (1 << 7), // \p{Cc} Control Ll = L | 1, // \p{Ll} Lowercase Letter
Cf = C | (2 << 7), // \p{Cf} Format Lm = L | 2, // \p{Lm} Modifier Letter
Co = C | (3 << 7), // \p{Co} Private Use Lo = L | 3, // \p{Lo} Other Letter
Cs = C | (4 << 7), // \p{Cs} Surrogate Lt = L | 4, // \p{Lt} Titlecase Letter
Ll = L | (1 << 7), // \p{Ll} Lowercase Letter Lu = L | 5, // \p{Lu} Uppercase Letter
Lm = L | (2 << 7), // \p{Lm} Modifier Letter Mc = M | 1, // \p{Mc} Spacing Mark
Lo = L | (3 << 7), // \p{Lo} Other Letter Me = M | 2, // \p{Me} Enclosing Mark
Lt = L | (4 << 7), // \p{Lt} Titlecase Letter Mn = M | 3, // \p{Mn} Nonspacing Mark
Lu = L | (5 << 7), // \p{Lu} Uppercase Letter Nd = N | 1, // \p{Nd} Decimal Number
Mc = M | (1 << 7), // \p{Mc} Spacing Mark Nl = N | 2, // \p{Nl} Letter Number
Me = M | (2 << 7), // \p{Me} Enclosing Mark No = N | 3, // \p{No} Other Number
Mn = M | (3 << 7), // \p{Mn} Nonspacing Mark Pc = P | 1, // \p{Pc} Connector Punctuation
Nd = N | (1 << 7), // \p{Nd} Decimal Number Pd = P | 2, // \p{Pd} Dash Punctuation
Nl = N | (2 << 7), // \p{Nl} Letter Number Pe = P | 3, // \p{Pe} Close Punctuation
No = N | (3 << 7), // \p{No} Other Number Pf = P | 4, // \p{Pf} Final Punctuation
Pc = P | (1 << 7), // \p{Pc} Connector Punctuation Pi = P | 5, // \p{Pi} Initial Punctuation
Pd = P | (2 << 7), // \p{Pd} Dash Punctuation Po = P | 6, // \p{Po} Other Punctuation
Pe = P | (3 << 7), // \p{Pe} Close Punctuation Ps = P | 7, // \p{Ps} Open Punctuation
Pf = P | (4 << 7), // \p{Pf} Final Punctuation Sc = S | 1, // \p{Sc} Currency Symbol
Pi = P | (5 << 7), // \p{Pi} Initial Punctuation Sk = S | 2, // \p{Sk} Modifier Symbol
Po = P | (6 << 7), // \p{Po} Other Punctuation Sm = S | 3, // \p{Sm} Math Symbol
Ps = P | (7 << 7), // \p{Ps} Open Punctuation So = S | 4, // \p{So} Other Symbol
Sc = S | (1 << 7), // \p{Sc} Currency Symbol Zl = Z | 1, // \p{Zl} Line Separator
Sk = S | (2 << 7), // \p{Sk} Modifier Symbol Zp = Z | 2, // \p{Zp} Paragraph Separator
Sm = S | (3 << 7), // \p{Sm} Math Symbol Zs = Z | 3, // \p{Zs} Space Separator
So = S | (4 << 7), // \p{So} Other Symbol SUBMASK = (1 << 3) - 1, // 3 bits 0b000000'0000000'111
Zl = Z | (1 << 7), // \p{Zl} Line Separator MASK = (1 << 10) - 1, // 7+3 bits 0b000000'1111111'111
Zp = Z | (2 << 7), // \p{Zp} Paragraph Separator
Zs = Z | (3 << 7), // \p{Zs} Space Separator
SUBMASK = (1 << 10) - 1 // 7+3 bits
}; };
enum _flags : uint16_t { enum _flags : uint16_t {
WHITESPACE = (1 << 10), // regex: \s WHITESPACES = (1 << 10), // regex: \s
LOWERCASE = (1 << 11), WORDS = (1 << 11), // regex: \w
UPPERCASE = (1 << 12), DIGITS = (1 << 12), // regex: \d
//Norm NFD/NFC = ..., //Norm NFD/NFC = ...,
}; };
inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {} inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
inline void set_flag(_flags flags, bool value = true) { inline void set_flag(_flags flags, bool value = true) {
flags = (_flags) (flags & ~SUBMASK); // ignore category bits flags = (_flags) (flags & ~MASK); // do not modify category bits
encoded = value ? (encoded | flags) : (encoded & ~flags); encoded = value ? (encoded | flags) : (encoded & ~flags);
} }
inline uint16_t get_category() const { return encoded & MASK; } inline uint16_t get_category() const { return encoded & MASK; }
inline uint16_t get_subcategory() const { return encoded & SUBMASK; }
inline bool is_undefined() const { return !encoded; } inline bool is_undefined() const { return !encoded; }
inline bool is_defined() const { return encoded; } inline bool is_defined() const { return encoded; }
inline uint16_t is_whitespace() const { return encoded & WHITESPACE; } inline uint16_t is_whitespace() const { return encoded & WHITESPACES; }
inline uint16_t is_lowercase() const { return encoded & LOWERCASE; } inline uint16_t is_word() const { return encoded & WORDS; }
inline uint16_t is_uppercase() const { return encoded & UPPERCASE; } inline uint16_t is_digit() const { return encoded & DIGITS; }
inline uint16_t is_C() const { return encoded & C; } inline uint16_t is_C() const { return encoded & C; }
inline uint16_t is_L() const { return encoded & L; } inline uint16_t is_L() const { return encoded & L; }
@ -86,35 +83,35 @@ struct codepoint_categ {
inline uint16_t is_S() const { return encoded & S; } inline uint16_t is_S() const { return encoded & S; }
inline uint16_t is_Z() const { return encoded & Z; } inline uint16_t is_Z() const { return encoded & Z; }
inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; } inline bool is_Cc() const { return (encoded & MASK) == Cc; }
inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; } inline bool is_Cf() const { return (encoded & MASK) == Cf; }
inline bool is_Co() const { return (encoded & SUBMASK) == Co; } inline bool is_Co() const { return (encoded & MASK) == Co; }
inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; } inline bool is_Cs() const { return (encoded & MASK) == Cs; }
inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; } inline bool is_Ll() const { return (encoded & MASK) == Ll; }
inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; } inline bool is_Lm() const { return (encoded & MASK) == Lm; }
inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; } inline bool is_Lo() const { return (encoded & MASK) == Lo; }
inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; } inline bool is_Lt() const { return (encoded & MASK) == Lt; }
inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; } inline bool is_Lu() const { return (encoded & MASK) == Lu; }
inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; } inline bool is_Mc() const { return (encoded & MASK) == Mc; }
inline bool is_Me() const { return (encoded & SUBMASK) == Me; } inline bool is_Me() const { return (encoded & MASK) == Me; }
inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; } inline bool is_Mn() const { return (encoded & MASK) == Mn; }
inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; } inline bool is_Nd() const { return (encoded & MASK) == Nd; }
inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; } inline bool is_Nl() const { return (encoded & MASK) == Nl; }
inline bool is_No() const { return (encoded & SUBMASK) == No; } inline bool is_No() const { return (encoded & MASK) == No; }
inline bool is_Pc() const { return (encoded & SUBMASK) == Pc; } inline bool is_Pc() const { return (encoded & MASK) == Pc; }
inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; } inline bool is_Pd() const { return (encoded & MASK) == Pd; }
inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; } inline bool is_Pe() const { return (encoded & MASK) == Pe; }
inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; } inline bool is_Pf() const { return (encoded & MASK) == Pf; }
inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; } inline bool is_Pi() const { return (encoded & MASK) == Pi; }
inline bool is_Po() const { return (encoded & SUBMASK) == Po; } inline bool is_Po() const { return (encoded & MASK) == Po; }
inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; } inline bool is_Ps() const { return (encoded & MASK) == Ps; }
inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; } inline bool is_Sc() const { return (encoded & MASK) == Sc; }
inline bool is_Sk() const { return (encoded & SUBMASK) == Sk; } inline bool is_Sk() const { return (encoded & MASK) == Sk; }
inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; } inline bool is_Sm() const { return (encoded & MASK) == Sm; }
inline bool is_So() const { return (encoded & SUBMASK) == So; } inline bool is_So() const { return (encoded & MASK) == So; }
inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; } inline bool is_Zl() const { return (encoded & MASK) == Zl; }
inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; } inline bool is_Zp() const { return (encoded & MASK) == Zp; }
inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; } inline bool is_Zs() const { return (encoded & MASK) == Zs; }
inline bool operator == (const codepoint_categ other) const { inline bool operator == (const codepoint_categ other) const {
return encoded == other.encoded; return encoded == other.encoded;
@ -132,7 +129,7 @@ struct codepoint_categ {
{Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"}, {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"},
{Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"}, {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"},
}; };
const auto it = map.find(encoded & SUBMASK); const auto it = map.find(encoded & MASK);
return it == map.end() ? "INVALID" : it->second; return it == map.end() ? "INVALID" : it->second;
} }
@ -149,18 +146,19 @@ struct codepoint_categ {
return 0; return 0;
} }
const char * p = strchr(subcategs, subcateg); const char * p = strchr(subcategs, subcateg);
return (uint16_t) (p ? (p - subcategs + 1) : 0); GGML_ASSERT(p);
return (uint16_t) (p - subcategs + 1);
}; };
switch(categ) { switch(categ) {
case 'C': if(subcateg == 'n') return 0; // undefined case 'C': if(subcateg == 'n') return 0; // undefined
return C | (_subindex(subcateg, "cfos" ) << 7); return C | _subindex(subcateg, "cfos" );
case 'L': return L | (_subindex(subcateg, "lmotu" ) << 7); case 'L': return L | _subindex(subcateg, "lmotu" );
case 'M': return M | (_subindex(subcateg, "cen" ) << 7); case 'M': return M | _subindex(subcateg, "cen" );
case 'N': return N | (_subindex(subcateg, "dlo" ) << 7); case 'N': return N | _subindex(subcateg, "dlo" );
case 'P': return P | (_subindex(subcateg, "cdefios") << 7); case 'P': return P | _subindex(subcateg, "cdefios");
case 'S': return S | (_subindex(subcateg, "ckmo" ) << 7); case 'S': return S | _subindex(subcateg, "ckmo" );
case 'Z': return Z | (_subindex(subcateg, "lps" ) << 7); case 'Z': return Z | _subindex(subcateg, "lps" );
default: assert (false); return 0; default: GGML_ABORT("invalid category character");
} }
} }