From 2636cb61703d5984dc94d3d757d0c506ec783846 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 20 Jul 2024 23:19:42 +0200
Subject: [PATCH] Decode unicode data categories

---
Review note: in unicode_cpt_category() the lowercase/uppercase loops now index
cpt_categs with p.second (as the removed unicode_cpt_flags_array() did) instead
of the outer RLE cursor `cpt`, which equals MAX_CODEPOINTS at that point and
therefore wrote one element past the end of the table while flagging nothing.
Hunk line counts are unchanged. Template arguments and #include targets were
missing from the collapsed text of this patch and are restored from usage.

 scripts/gen-unicode-data.py |  3 +-
 src/unicode.cpp             | 77 ++++++++++++++++++-------------------
 src/unicode.h               | 26 +++++++++++++
 3 files changed, 65 insertions(+), 41 deletions(-)

diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index 55ac0af12..542a9edba 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -49,6 +49,7 @@ def unicode_data_iter():
         yield (cpt, cpt_lower, cpt_upper, categ, bidir)
 
 
+# see codepoint_categ::from_index() in unicode.h
 UNICODE_CATEGORY_TO_INDEX = {
     "Cn":  0,  # \p{Cn} Undefined
     "Cc":  1,  # \p{Cc} Control
@@ -123,7 +124,7 @@ table_uppercase.sort()
 table_nfd.sort()
 
 
-# run length encoding
+# run length encoding, see unicode_cpt_category() in unicode.cpp
 assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32)
 codepoint_categs_runs = [codepoint_categs[0]]  # 5 bits categ + 11 bits length
 for cpt, categ in enumerate(codepoint_categs[1:], 1):
diff --git a/src/unicode.cpp b/src/unicode.cpp
index e05fb9d17..a78c59f74 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -113,38 +113,6 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 //    return result;
 //}
 
-static std::vector<codepoint_flags> unicode_cpt_flags_array() {
-    std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
-
-    assert (unicode_ranges_flags.front().first == 0);
-    assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
-    for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
-        const auto range_ini = unicode_ranges_flags[i-1];  // codepoint_ini, flags
-        const auto range_end = unicode_ranges_flags[i];    // codepoint_end, flags
-        for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
-            cpt_flags[cpt] = range_ini.second;
-        }
-    }
-
-    for (auto cpt : unicode_set_whitespace) {
-        cpt_flags[cpt].is_whitespace = true;
-    }
-
-    for (auto p : unicode_map_lowercase) {
-        cpt_flags[p.second].is_lowercase = true;
-    }
-
-    for (auto p : unicode_map_uppercase) {
-        cpt_flags[p.second].is_uppercase = true;
-    }
-
-    for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
-        cpt_flags[range.nfd].is_nfd = true;
-    }
-
-    return cpt_flags;
-}
-
 static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
     std::unordered_map<uint8_t, std::string> map;
     for (int ch = 0x21; ch <= 0x7E; ++ch) {  // u'!' to u'~'
@@ -606,19 +574,48 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     return result;
 }
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-    static const auto cpt_flags = unicode_cpt_flags_array();
-    return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
+codepoint_categ unicode_cpt_category(const uint32_t cp) {
+    static const std::vector<codepoint_categ> cpt_categs = [] {
+        std::vector<codepoint_categ> cpt_categs(MAX_CODEPOINTS, codepoint_categ::UNDEF);
+        uint32_t cpt = 0;
+        for (uint16_t rle : unicode_rle_codepoints_categs) {
+            const uint32_t index = rle & 31;
+            const uint32_t count = rle >> 5;
+            const auto categ = codepoint_categ::from_index(index);
+            //printf( "Codepoints 0x%05X to 0x%05X categ %s\n", cpt, cpt + count, categ.c_str());
+            for (uint32_t i = 0; i <= count; ++i) {
+                cpt_categs[cpt++] = categ;
+            }
+        }
+        assert (cpt == MAX_CODEPOINTS);
+
+        for (auto cpt : unicode_set_whitespace) {
+            cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+        }
+
+        for (auto p : unicode_map_lowercase) {
+            cpt_categs[p.second].set_flag(codepoint_categ::LOWERCASE);  // p.second, not the outer cpt (== MAX_CODEPOINTS here)
+        }
+
+        for (auto p : unicode_map_uppercase) {
+            cpt_categs[p.second].set_flag(codepoint_categ::UPPERCASE);  // p.second, not the outer cpt (== MAX_CODEPOINTS here)
+        }
+
+        //for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
+        //    cpt_categs[range.nfd].set_flag(codepoint_categ::NORM_NFD);
+        //}
+
+        return cpt_categs;
+    }();
+    return cp < cpt_categs.size() ? cpt_categs[cp] : codepoint_categ{};
 }
 
-codepoint_flags unicode_cpt_flags(const std::string & utf8) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+codepoint_categ unicode_cpt_category(const std::string & utf8) {
     if (utf8.empty()) {
-        return undef;  // undefined
+        return codepoint_categ{};  // undefined
     }
     size_t offset = 0;
-    return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
+    return unicode_cpt_category(unicode_cpt_from_utf8(utf8, offset));
 }
 
 std::string unicode_byte_to_utf8(uint8_t byte) {
diff --git a/src/unicode.h b/src/unicode.h
index f9f4fcc8c..e8928f261 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -3,6 +3,8 @@
 #include <cstdint>
 #include <string>
 #include <vector>
+#include <array>
+#include <map>
 
 struct codepoint_categ {
     enum _category : uint16_t {
@@ -59,6 +61,18 @@ struct codepoint_categ {
 
     inline codepoint_categ(const uint16_t categ=0) : encoded{categ} {}
 
+    static codepoint_categ from_index(int index) {
+        static const std::array<codepoint_categ, 32> table = {
+            UNDEF, Cc, Cf, Co, Cs, Ll, Lm, Lo, Lt, Lu, Mc, Me, Mn, Nd, Nl, No, Pc, Pd, Pe, Pf, Pi, Po, Ps, Sc, Sk, Sm, So, Zl, Zp, Zs, UNDEF, UNDEF
+        };
+        return (size_t)index < table.size() ? table[index] : table[0];
+    }
+
+    inline void set_flag(_flags flags, bool value = true) {
+        flags = (_flags) (flags & ~SUBMASK);  // ignore category bits
+        encoded = value ? (encoded | flags) : (encoded & ~flags);
+    }
+
     inline uint8_t get_category()    const { return encoded & MASK; }
     inline uint8_t get_subcategory() const { return encoded & SUBMASK; }
 
@@ -107,6 +121,18 @@ struct codepoint_categ {
     inline auto is_Zp() const { return (encoded & SUBMASK) == Zp; }
     inline auto is_Zs() const { return (encoded & SUBMASK) == Zs; }
 
+    const char * c_str() const {
+        static const std::map<uint16_t, const char *> map = {
+            {UNDEF, "UNDEF"}, {C, "C"}, {L, "L"}, {M, "M"}, {N, "N"}, {P, "P"}, {S, "S"}, {Z, "Z"},
+            {Cc, "Cc"}, {Cf, "Cf"}, {Co, "Co"}, {Cs, "Cs"}, {Ll, "Ll"}, {Lm, "Lm"}, {Lo, "Lo"}, {Lt, "Lt"},
+            {Lu, "Lu"}, {Mc, "Mc"}, {Me, "Me"}, {Mn, "Mn"}, {Nd, "Nd"}, {Nl, "Nl"}, {No, "No"}, {Pc, "Pc"},
+            {Pd, "Pd"}, {Pe, "Pe"}, {Pf, "Pf"}, {Pi, "Pi"}, {Po, "Po"}, {Ps, "Ps"}, {Sc, "Sc"}, {Sk, "Sk"},
+            {Sm, "Sm"}, {So, "So"}, {Zl, "Zl"}, {Zp, "Zp"}, {Zs, "Zs"},
+        };
+        const auto it = map.find(encoded & SUBMASK);
+        return it == map.end() ? "INVALID" : it->second;
+    }
+
     uint16_t encoded;
 };
 