Unicode data: store whitespace code points as ranges
This commit is contained in:
parent
80f41234e4
commit
7afe6df6a2
4 changed files with 28 additions and 41 deletions
|
@ -85,7 +85,6 @@ UNICODE_CATEGORY_TO_INDEX = {
|
|||
|
||||
|
||||
codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS # Undefined
|
||||
table_whitespace = []
|
||||
table_lowercase = []
|
||||
table_uppercase = []
|
||||
table_nfd = []
|
||||
|
@ -111,19 +110,20 @@ for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
|
|||
table_nfd.append((cpt, norm))
|
||||
|
||||
|
||||
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
|
||||
table_whitespace.extend(range(0x0009, 0x000D + 1))
|
||||
table_whitespace.extend(range(0x2000, 0x200A + 1))
|
||||
table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
|
||||
|
||||
|
||||
# sort by codepoint
|
||||
table_whitespace.sort()
|
||||
table_lowercase.sort()
|
||||
table_uppercase.sort()
|
||||
table_nfd.sort()
|
||||
|
||||
|
||||
# whitespace code points: see the White_Space property in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
|
||||
whitespace_ranges: list[tuple[int, int]] = [] # start, last
|
||||
whitespace_ranges.append((0x0009, 0x000D))
|
||||
whitespace_ranges.append((0x2000, 0x200A))
|
||||
for whitespace in [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]:
|
||||
whitespace_ranges.append((whitespace, whitespace))
|
||||
|
||||
|
||||
# run length encoding, see unicode_cpt_category() in unicode.cpp
|
||||
assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32)
|
||||
codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length
|
||||
|
@ -162,7 +162,6 @@ out("""\
|
|||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
""")
|
||||
|
||||
out("const std::vector<uint16_t> unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length")
|
||||
|
@ -170,9 +169,9 @@ for rle in codepoint_categs_runs:
|
|||
out("0x%04X," % rle)
|
||||
out("};\n")
|
||||
|
||||
out("const std::vector<uint32_t> unicode_vec_whitespace = {")
|
||||
for codepoint in table_whitespace:
|
||||
out("0x%06X," % codepoint)
|
||||
out("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {")
|
||||
for (start, last) in whitespace_ranges:
|
||||
out("{0x%06X, 0x%06X}," % (start, last))
|
||||
out("};\n")
|
||||
|
||||
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
|
||||
|
|
|
@ -4526,32 +4526,18 @@ const std::vector<uint16_t> unicode_rle_codepoints_categs = { // run length enc
|
|||
0x0020,
|
||||
};
|
||||
|
||||
const std::vector<uint32_t> unicode_vec_whitespace = {
|
||||
0x000009,
|
||||
0x00000A,
|
||||
0x00000B,
|
||||
0x00000C,
|
||||
0x00000D,
|
||||
0x000020,
|
||||
0x000085,
|
||||
0x0000A0,
|
||||
0x001680,
|
||||
0x002000,
|
||||
0x002001,
|
||||
0x002002,
|
||||
0x002003,
|
||||
0x002004,
|
||||
0x002005,
|
||||
0x002006,
|
||||
0x002007,
|
||||
0x002008,
|
||||
0x002009,
|
||||
0x00200A,
|
||||
0x002028,
|
||||
0x002029,
|
||||
0x00202F,
|
||||
0x00205F,
|
||||
0x003000,
|
||||
const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {
|
||||
{0x000009, 0x00000D},
|
||||
{0x002000, 0x00200A},
|
||||
{0x000020, 0x000020},
|
||||
{0x000085, 0x000085},
|
||||
{0x0000A0, 0x0000A0},
|
||||
{0x001680, 0x001680},
|
||||
{0x002028, 0x002028},
|
||||
{0x002029, 0x002029},
|
||||
{0x00202F, 0x00202F},
|
||||
{0x00205F, 0x00205F},
|
||||
{0x003000, 0x003000},
|
||||
};
|
||||
|
||||
const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
|
||||
|
|
|
@ -13,7 +13,7 @@ struct range_nfd {
|
|||
static const uint32_t MAX_CODEPOINTS = 0x110000;
|
||||
|
||||
extern const std::vector<uint16_t> unicode_rle_codepoints_categs;
|
||||
extern const std::vector<uint32_t> unicode_vec_whitespace;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
|
||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
|
||||
extern const std::vector<range_nfd> unicode_ranges_nfd;
|
||||
|
|
|
@ -597,8 +597,10 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
|
|||
}
|
||||
GGML_ASSERT(cpt == MAX_CODEPOINTS);
|
||||
|
||||
for (auto cpt : unicode_vec_whitespace) {
|
||||
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
|
||||
for (auto p : unicode_ranges_whitespace) {
|
||||
for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
|
||||
cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto p : unicode_map_lowercase) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue