Update codepoint_categ:
- Reorganize category/subcategory bits.
- Regex flags for \s \w \d.
This commit is contained in:
parent
312c4322cc
commit
b565148cb4
2 changed files with 96 additions and 102 deletions
|
@ -209,7 +209,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
|
||||
static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
|
||||
auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
|
||||
};
|
||||
|
@ -328,7 +328,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
static const codepoint_categ SENTINEL = codepoint_categ::MASK + 1;
|
||||
static const codepoint_categ SENTINEL = codepoint_categ::UNDEF + 1;
|
||||
auto _get_categ = [&] (const size_t pos) -> codepoint_categ {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_category(cpts[pos]) : SENTINEL;
|
||||
};
|
||||
|
@ -589,28 +589,24 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
|
|||
for (uint16_t rle : unicode_rle_codepoints_categs) {
|
||||
const uint32_t index = rle & 31;
|
||||
const uint32_t count = rle >> 5;
|
||||
const auto categ = codepoint_categ::from_index(index);
|
||||
//printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
|
||||
auto categ = codepoint_categ::from_index(index);
|
||||
//printf("Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
|
||||
categ.set_flag(codepoint_categ::DIGITS, categ.is_Nd()); // \d --> \p{Nd}
|
||||
categ.set_flag(codepoint_categ::WORDS, categ.is_L() | categ.is_N()); // \w --> \p{L} \p{N} _
|
||||
for (uint32_t i = 0; i <= count; ++i) {
|
||||
cpt_categs[cpt++] = categ;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(cpt == MAX_CODEPOINTS);
|
||||
|
||||
cpt_categs['_'].set_flag(codepoint_categ::WORDS); // \w --> \p{L} \p{N} _
|
||||
|
||||
for (auto p : unicode_ranges_whitespace) {
|
||||
for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
|
||||
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
|
||||
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACES);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto p : unicode_map_lowercase) {
|
||||
cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE);
|
||||
}
|
||||
|
||||
for (auto p : unicode_map_uppercase) {
|
||||
cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE);
|
||||
}
|
||||
|
||||
//for (auto &range : unicode_ranges_nfd) { // start, last, nfd
|
||||
// cpt_categs[cpt].set_flag(codepoint_categ::NORM_NFD);
|
||||
//}
|
||||
|
|
174
src/unicode.h
174
src/unicode.h
|
@ -9,74 +9,71 @@
|
|||
#include <map>
|
||||
|
||||
struct codepoint_categ {
|
||||
// 0bffffff'ccccccc'sss --> 6 bits flags + 7 bits category + 3 bits subcategory
|
||||
enum _category : uint16_t {
|
||||
UNDEF = 0, // \p{Cn} Undefined
|
||||
C = 1 << 0, // \p{C} Control
|
||||
L = 1 << 1, // \p{L} Letter
|
||||
M = 1 << 2, // \p{M} Mark
|
||||
N = 1 << 3, // \p{N} Number
|
||||
P = 1 << 4, // \p{P} Punctuation
|
||||
S = 1 << 5, // \p{S} Symbol
|
||||
Z = 1 << 6, // \p{Z} Separator
|
||||
MASK = (1 << 7) - 1 // 7 bits
|
||||
};
|
||||
|
||||
enum _subcategory : uint16_t {
|
||||
Cc = C | (1 << 7), // \p{Cc} Control
|
||||
Cf = C | (2 << 7), // \p{Cf} Format
|
||||
Co = C | (3 << 7), // \p{Co} Private Use
|
||||
Cs = C | (4 << 7), // \p{Cs} Surrogate
|
||||
Ll = L | (1 << 7), // \p{Ll} Lowercase Letter
|
||||
Lm = L | (2 << 7), // \p{Lm} Modifier Letter
|
||||
Lo = L | (3 << 7), // \p{Lo} Other Letter
|
||||
Lt = L | (4 << 7), // \p{Lt} Titlecase Letter
|
||||
Lu = L | (5 << 7), // \p{Lu} Uppercase Letter
|
||||
Mc = M | (1 << 7), // \p{Mc} Spacing Mark
|
||||
Me = M | (2 << 7), // \p{Me} Enclosing Mark
|
||||
Mn = M | (3 << 7), // \p{Mn} Nonspacing Mark
|
||||
Nd = N | (1 << 7), // \p{Nd} Decimal Number
|
||||
Nl = N | (2 << 7), // \p{Nl} Letter Number
|
||||
No = N | (3 << 7), // \p{No} Other Number
|
||||
Pc = P | (1 << 7), // \p{Pc} Connector Punctuation
|
||||
Pd = P | (2 << 7), // \p{Pd} Dash Punctuation
|
||||
Pe = P | (3 << 7), // \p{Pe} Close Punctuation
|
||||
Pf = P | (4 << 7), // \p{Pf} Final Punctuation
|
||||
Pi = P | (5 << 7), // \p{Pi} Initial Punctuation
|
||||
Po = P | (6 << 7), // \p{Po} Other Punctuation
|
||||
Ps = P | (7 << 7), // \p{Ps} Open Punctuation
|
||||
Sc = S | (1 << 7), // \p{Sc} Currency Symbol
|
||||
Sk = S | (2 << 7), // \p{Sk} Modifier Symbol
|
||||
Sm = S | (3 << 7), // \p{Sm} Math Symbol
|
||||
So = S | (4 << 7), // \p{So} Other Symbol
|
||||
Zl = Z | (1 << 7), // \p{Zl} Line Separator
|
||||
Zp = Z | (2 << 7), // \p{Zp} Paragraph Separator
|
||||
Zs = Z | (3 << 7), // \p{Zs} Space Separator
|
||||
SUBMASK = (1 << 10) - 1 // 7+3 bits
|
||||
C = 1 << (0 + 3), // \p{C} Control
|
||||
L = 1 << (1 + 3), // \p{L} Letter
|
||||
M = 1 << (2 + 3), // \p{M} Mark
|
||||
N = 1 << (3 + 3), // \p{N} Number
|
||||
P = 1 << (4 + 3), // \p{P} Punctuation
|
||||
S = 1 << (5 + 3), // \p{S} Symbol
|
||||
Z = 1 << (6 + 3), // \p{Z} Separator
|
||||
Cc = C | 1, // \p{Cc} Control
|
||||
Cf = C | 2, // \p{Cf} Format
|
||||
Co = C | 3, // \p{Co} Private Use
|
||||
Cs = C | 4, // \p{Cs} Surrogate
|
||||
Ll = L | 1, // \p{Ll} Lowercase Letter
|
||||
Lm = L | 2, // \p{Lm} Modifier Letter
|
||||
Lo = L | 3, // \p{Lo} Other Letter
|
||||
Lt = L | 4, // \p{Lt} Titlecase Letter
|
||||
Lu = L | 5, // \p{Lu} Uppercase Letter
|
||||
Mc = M | 1, // \p{Mc} Spacing Mark
|
||||
Me = M | 2, // \p{Me} Enclosing Mark
|
||||
Mn = M | 3, // \p{Mn} Nonspacing Mark
|
||||
Nd = N | 1, // \p{Nd} Decimal Number
|
||||
Nl = N | 2, // \p{Nl} Letter Number
|
||||
No = N | 3, // \p{No} Other Number
|
||||
Pc = P | 1, // \p{Pc} Connector Punctuation
|
||||
Pd = P | 2, // \p{Pd} Dash Punctuation
|
||||
Pe = P | 3, // \p{Pe} Close Punctuation
|
||||
Pf = P | 4, // \p{Pf} Final Punctuation
|
||||
Pi = P | 5, // \p{Pi} Initial Punctuation
|
||||
Po = P | 6, // \p{Po} Other Punctuation
|
||||
Ps = P | 7, // \p{Ps} Open Punctuation
|
||||
Sc = S | 1, // \p{Sc} Currency Symbol
|
||||
Sk = S | 2, // \p{Sk} Modifier Symbol
|
||||
Sm = S | 3, // \p{Sm} Math Symbol
|
||||
So = S | 4, // \p{So} Other Symbol
|
||||
Zl = Z | 1, // \p{Zl} Line Separator
|
||||
Zp = Z | 2, // \p{Zp} Paragraph Separator
|
||||
Zs = Z | 3, // \p{Zs} Space Separator
|
||||
SUBMASK = (1 << 3) - 1, // 3 bits 0b000000'0000000'111
|
||||
MASK = (1 << 10) - 1, // 7+3 bits 0b000000'1111111'111
|
||||
};
|
||||
|
||||
enum _flags : uint16_t {
|
||||
WHITESPACE = (1 << 10), // regex: \s
|
||||
LOWERCASE = (1 << 11),
|
||||
UPPERCASE = (1 << 12),
|
||||
WHITESPACES = (1 << 10), // regex: \s
|
||||
WORDS = (1 << 11), // regex: \w
|
||||
DIGITS = (1 << 12), // regex: \d
|
||||
//Norm NFD/NFC = ...,
|
||||
};
|
||||
|
||||
inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
|
||||
|
||||
inline void set_flag(_flags flags, bool value = true) {
|
||||
flags = (_flags) (flags & ~SUBMASK); // ignore category bits
|
||||
flags = (_flags) (flags & ~MASK); // do not modify category bits
|
||||
encoded = value ? (encoded | flags) : (encoded & ~flags);
|
||||
}
|
||||
|
||||
inline uint16_t get_category() const { return encoded & MASK; }
|
||||
inline uint16_t get_subcategory() const { return encoded & SUBMASK; }
|
||||
|
||||
inline bool is_undefined() const { return !encoded; }
|
||||
inline bool is_defined() const { return encoded; }
|
||||
|
||||
inline uint16_t is_whitespace() const { return encoded & WHITESPACE; }
|
||||
inline uint16_t is_lowercase() const { return encoded & LOWERCASE; }
|
||||
inline uint16_t is_uppercase() const { return encoded & UPPERCASE; }
|
||||
inline uint16_t is_whitespace() const { return encoded & WHITESPACES; }
|
||||
inline uint16_t is_word() const { return encoded & WORDS; }
|
||||
inline uint16_t is_digit() const { return encoded & DIGITS; }
|
||||
|
||||
inline uint16_t is_C() const { return encoded & C; }
|
||||
inline uint16_t is_L() const { return encoded & L; }
|
||||
|
@ -86,35 +83,35 @@ struct codepoint_categ {
|
|||
inline uint16_t is_S() const { return encoded & S; }
|
||||
inline uint16_t is_Z() const { return encoded & Z; }
|
||||
|
||||
inline bool is_Cc() const { return (encoded & SUBMASK) == Cc; }
|
||||
inline bool is_Cf() const { return (encoded & SUBMASK) == Cf; }
|
||||
inline bool is_Co() const { return (encoded & SUBMASK) == Co; }
|
||||
inline bool is_Cs() const { return (encoded & SUBMASK) == Cs; }
|
||||
inline bool is_Ll() const { return (encoded & SUBMASK) == Ll; }
|
||||
inline bool is_Lm() const { return (encoded & SUBMASK) == Lm; }
|
||||
inline bool is_Lo() const { return (encoded & SUBMASK) == Lo; }
|
||||
inline bool is_Lt() const { return (encoded & SUBMASK) == Lt; }
|
||||
inline bool is_Lu() const { return (encoded & SUBMASK) == Lu; }
|
||||
inline bool is_Mc() const { return (encoded & SUBMASK) == Mc; }
|
||||
inline bool is_Me() const { return (encoded & SUBMASK) == Me; }
|
||||
inline bool is_Mn() const { return (encoded & SUBMASK) == Mn; }
|
||||
inline bool is_Nd() const { return (encoded & SUBMASK) == Nd; }
|
||||
inline bool is_Nl() const { return (encoded & SUBMASK) == Nl; }
|
||||
inline bool is_No() const { return (encoded & SUBMASK) == No; }
|
||||
inline bool is_Pc() const { return (encoded & SUBMASK) == Pc; }
|
||||
inline bool is_Pd() const { return (encoded & SUBMASK) == Pd; }
|
||||
inline bool is_Pe() const { return (encoded & SUBMASK) == Pe; }
|
||||
inline bool is_Pf() const { return (encoded & SUBMASK) == Pf; }
|
||||
inline bool is_Pi() const { return (encoded & SUBMASK) == Pi; }
|
||||
inline bool is_Po() const { return (encoded & SUBMASK) == Po; }
|
||||
inline bool is_Ps() const { return (encoded & SUBMASK) == Ps; }
|
||||
inline bool is_Sc() const { return (encoded & SUBMASK) == Sc; }
|
||||
inline bool is_Sk() const { return (encoded & SUBMASK) == Sk; }
|
||||
inline bool is_Sm() const { return (encoded & SUBMASK) == Sm; }
|
||||
inline bool is_So() const { return (encoded & SUBMASK) == So; }
|
||||
inline bool is_Zl() const { return (encoded & SUBMASK) == Zl; }
|
||||
inline bool is_Zp() const { return (encoded & SUBMASK) == Zp; }
|
||||
inline bool is_Zs() const { return (encoded & SUBMASK) == Zs; }
|
||||
inline bool is_Cc() const { return (encoded & MASK) == Cc; }
|
||||
inline bool is_Cf() const { return (encoded & MASK) == Cf; }
|
||||
inline bool is_Co() const { return (encoded & MASK) == Co; }
|
||||
inline bool is_Cs() const { return (encoded & MASK) == Cs; }
|
||||
inline bool is_Ll() const { return (encoded & MASK) == Ll; }
|
||||
inline bool is_Lm() const { return (encoded & MASK) == Lm; }
|
||||
inline bool is_Lo() const { return (encoded & MASK) == Lo; }
|
||||
inline bool is_Lt() const { return (encoded & MASK) == Lt; }
|
||||
inline bool is_Lu() const { return (encoded & MASK) == Lu; }
|
||||
inline bool is_Mc() const { return (encoded & MASK) == Mc; }
|
||||
inline bool is_Me() const { return (encoded & MASK) == Me; }
|
||||
inline bool is_Mn() const { return (encoded & MASK) == Mn; }
|
||||
inline bool is_Nd() const { return (encoded & MASK) == Nd; }
|
||||
inline bool is_Nl() const { return (encoded & MASK) == Nl; }
|
||||
inline bool is_No() const { return (encoded & MASK) == No; }
|
||||
inline bool is_Pc() const { return (encoded & MASK) == Pc; }
|
||||
inline bool is_Pd() const { return (encoded & MASK) == Pd; }
|
||||
inline bool is_Pe() const { return (encoded & MASK) == Pe; }
|
||||
inline bool is_Pf() const { return (encoded & MASK) == Pf; }
|
||||
inline bool is_Pi() const { return (encoded & MASK) == Pi; }
|
||||
inline bool is_Po() const { return (encoded & MASK) == Po; }
|
||||
inline bool is_Ps() const { return (encoded & MASK) == Ps; }
|
||||
inline bool is_Sc() const { return (encoded & MASK) == Sc; }
|
||||
inline bool is_Sk() const { return (encoded & MASK) == Sk; }
|
||||
inline bool is_Sm() const { return (encoded & MASK) == Sm; }
|
||||
inline bool is_So() const { return (encoded & MASK) == So; }
|
||||
inline bool is_Zl() const { return (encoded & MASK) == Zl; }
|
||||
inline bool is_Zp() const { return (encoded & MASK) == Zp; }
|
||||
inline bool is_Zs() const { return (encoded & MASK) == Zs; }
|
||||
|
||||
inline bool operator == (const codepoint_categ other) const {
|
||||
return encoded == other.encoded;
|
||||
|
@ -132,7 +129,7 @@ struct codepoint_categ {
|
|||
{Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"},
|
||||
{Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"},
|
||||
};
|
||||
const auto it = map.find(encoded & SUBMASK);
|
||||
const auto it = map.find(encoded & MASK);
|
||||
return it == map.end() ? "INVALID" : it->second;
|
||||
}
|
||||
|
||||
|
@ -149,18 +146,19 @@ struct codepoint_categ {
|
|||
return 0;
|
||||
}
|
||||
const char * p = strchr(subcategs, subcateg);
|
||||
return (uint16_t) (p ? (p - subcategs + 1) : 0);
|
||||
GGML_ASSERT(p);
|
||||
return (uint16_t) (p - subcategs + 1);
|
||||
};
|
||||
switch(categ) {
|
||||
case 'C': if(subcateg == 'n') return 0; // undefined
|
||||
return C | (_subindex(subcateg, "cfos" ) << 7);
|
||||
case 'L': return L | (_subindex(subcateg, "lmotu" ) << 7);
|
||||
case 'M': return M | (_subindex(subcateg, "cen" ) << 7);
|
||||
case 'N': return N | (_subindex(subcateg, "dlo" ) << 7);
|
||||
case 'P': return P | (_subindex(subcateg, "cdefios") << 7);
|
||||
case 'S': return S | (_subindex(subcateg, "ckmo" ) << 7);
|
||||
case 'Z': return Z | (_subindex(subcateg, "lps" ) << 7);
|
||||
default: assert (false); return 0;
|
||||
return C | _subindex(subcateg, "cfos" );
|
||||
case 'L': return L | _subindex(subcateg, "lmotu" );
|
||||
case 'M': return M | _subindex(subcateg, "cen" );
|
||||
case 'N': return N | _subindex(subcateg, "dlo" );
|
||||
case 'P': return P | _subindex(subcateg, "cdefios");
|
||||
case 'S': return S | _subindex(subcateg, "ckmo" );
|
||||
case 'Z': return Z | _subindex(subcateg, "lps" );
|
||||
default: GGML_ABORT("invalid category character");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue