Unicode data whitespaces as ranges

This commit is contained in:
jaime-m-p 2024-08-07 23:14:36 +02:00
parent 80f41234e4
commit 7afe6df6a2
4 changed files with 28 additions and 41 deletions

View file

@ -85,7 +85,6 @@ UNICODE_CATEGORY_TO_INDEX = {
codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS # Undefined codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS # Undefined
table_whitespace = []
table_lowercase = [] table_lowercase = []
table_uppercase = [] table_uppercase = []
table_nfd = [] table_nfd = []
@ -111,19 +110,20 @@ for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
table_nfd.append((cpt, norm)) table_nfd.append((cpt, norm))
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
table_whitespace.extend(range(0x0009, 0x000D + 1))
table_whitespace.extend(range(0x2000, 0x200A + 1))
table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
# sort by codepoint # sort by codepoint
table_whitespace.sort()
table_lowercase.sort() table_lowercase.sort()
table_uppercase.sort() table_uppercase.sort()
table_nfd.sort() table_nfd.sort()
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
whitespace_ranges: list[tuple[int, int]] = [] # start, last
whitespace_ranges.append((0x0009, 0x000D))
whitespace_ranges.append((0x2000, 0x200A))
for whitespace in [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]:
whitespace_ranges.append((whitespace, whitespace))
# run length encoding, see unicode_cpt_category() in unicode.cpp # run length encoding, see unicode_cpt_category() in unicode.cpp
assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32) assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32)
codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length
@ -162,7 +162,6 @@ out("""\
#include <cstdint> #include <cstdint>
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
""") """)
out("const std::vector<uint16_t> unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length") out("const std::vector<uint16_t> unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length")
@ -170,9 +169,9 @@ for rle in codepoint_categs_runs:
out("0x%04X," % rle) out("0x%04X," % rle)
out("};\n") out("};\n")
out("const std::vector<uint32_t> unicode_vec_whitespace = {") out("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {")
for codepoint in table_whitespace: for (start, last) in whitespace_ranges:
out("0x%06X," % codepoint) out("{0x%06X, 0x%06X}," % (start, last))
out("};\n") out("};\n")
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {") out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")

View file

@ -4526,32 +4526,18 @@ const std::vector<uint16_t> unicode_rle_codepoints_categs = { // run length enc
0x0020, 0x0020,
}; };
const std::vector<uint32_t> unicode_vec_whitespace = { const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {
0x000009, {0x000009, 0x00000D},
0x00000A, {0x002000, 0x00200A},
0x00000B, {0x000020, 0x000020},
0x00000C, {0x000085, 0x000085},
0x00000D, {0x0000A0, 0x0000A0},
0x000020, {0x001680, 0x001680},
0x000085, {0x002028, 0x002028},
0x0000A0, {0x002029, 0x002029},
0x001680, {0x00202F, 0x00202F},
0x002000, {0x00205F, 0x00205F},
0x002001, {0x003000, 0x003000},
0x002002,
0x002003,
0x002004,
0x002005,
0x002006,
0x002007,
0x002008,
0x002009,
0x00200A,
0x002028,
0x002029,
0x00202F,
0x00205F,
0x003000,
}; };
const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = { const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {

View file

@ -13,7 +13,7 @@ struct range_nfd {
static const uint32_t MAX_CODEPOINTS = 0x110000; static const uint32_t MAX_CODEPOINTS = 0x110000;
extern const std::vector<uint16_t> unicode_rle_codepoints_categs; extern const std::vector<uint16_t> unicode_rle_codepoints_categs;
extern const std::vector<uint32_t> unicode_vec_whitespace; extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase; extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase; extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
extern const std::vector<range_nfd> unicode_ranges_nfd; extern const std::vector<range_nfd> unicode_ranges_nfd;

View file

@ -597,8 +597,10 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
} }
GGML_ASSERT(cpt == MAX_CODEPOINTS); GGML_ASSERT(cpt == MAX_CODEPOINTS);
for (auto cpt : unicode_vec_whitespace) { for (auto p : unicode_ranges_whitespace) {
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
}
} }
for (auto p : unicode_map_lowercase) { for (auto p : unicode_map_lowercase) {