diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py index d774fcabe..1528a13db 100644 --- a/scripts/gen-unicode-data.py +++ b/scripts/gen-unicode-data.py @@ -85,7 +85,6 @@ UNICODE_CATEGORY_TO_INDEX = { codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS # Undefined -table_whitespace = [] table_lowercase = [] table_uppercase = [] table_nfd = [] @@ -111,19 +110,20 @@ for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter(): table_nfd.append((cpt, norm)) -# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt -table_whitespace.extend(range(0x0009, 0x000D + 1)) -table_whitespace.extend(range(0x2000, 0x200A + 1)) -table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]) - - # sort by codepoint -table_whitespace.sort() table_lowercase.sort() table_uppercase.sort() table_nfd.sort() +# whitespaces, see "" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +whitespace_ranges: list[tuple[int, int]] = [] # start, last +whitespace_ranges.append((0x0009, 0x000D)) +whitespace_ranges.append((0x2000, 0x200A)) +for whitespace in [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]: + whitespace_ranges.append((whitespace, whitespace)) + + # run length encoding, see unicode_cpt_category() in unicode.cpp assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32) codepoint_categs_runs = [codepoint_categs[0]] # 5 bits categ + 11 bits length @@ -162,7 +162,6 @@ out("""\ #include #include #include -#include """) out("const std::vector unicode_rle_codepoints_categs = { // run length encoding, 5 bits categ + 11 bits length") @@ -170,9 +169,9 @@ for rle in codepoint_categs_runs: out("0x%04X," % rle) out("};\n") -out("const std::vector unicode_vec_whitespace = {") -for codepoint in table_whitespace: - out("0x%06X," % codepoint) +out("const std::vector> unicode_ranges_whitespace = {") +for (start, last) in whitespace_ranges: + out("{0x%06X, 0x%06X}," % (start, last)) out("};\n") out("const std::unordered_map unicode_map_lowercase = {") diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 2591723ce..1a2ceb017 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -4526,32 +4526,18 @@ const std::vector unicode_rle_codepoints_categs = { // run length enc 0x0020, }; -const std::vector unicode_vec_whitespace = { -0x000009, -0x00000A, -0x00000B, -0x00000C, -0x00000D, -0x000020, -0x000085, -0x0000A0, -0x001680, -0x002000, -0x002001, -0x002002, -0x002003, -0x002004, -0x002005, -0x002006, -0x002007, -0x002008, -0x002009, -0x00200A, -0x002028, -0x002029, -0x00202F, -0x00205F, -0x003000, +const std::vector> unicode_ranges_whitespace = { +{0x000009, 0x00000D}, +{0x002000, 0x00200A}, +{0x000020, 0x000020}, +{0x000085, 0x000085}, +{0x0000A0, 0x0000A0}, +{0x001680, 0x001680}, +{0x002028, 0x002028}, +{0x002029, 0x002029}, +{0x00202F, 0x00202F}, +{0x00205F, 0x00205F}, +{0x003000, 0x003000}, }; const std::unordered_map unicode_map_lowercase = { diff --git a/src/unicode-data.h b/src/unicode-data.h index 682f79c37..447826879 100644 --- a/src/unicode-data.h +++ b/src/unicode-data.h @@ -13,7 +13,7 @@ struct range_nfd { static const uint32_t MAX_CODEPOINTS = 0x110000; extern const std::vector unicode_rle_codepoints_categs; -extern const std::vector unicode_vec_whitespace; +extern const std::vector> unicode_ranges_whitespace; extern const std::unordered_map unicode_map_lowercase; extern const std::unordered_map unicode_map_uppercase; extern const std::vector unicode_ranges_nfd; diff --git a/src/unicode.cpp b/src/unicode.cpp index 725476600..6ebef0ec9 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -597,8 +597,10 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) { } GGML_ASSERT(cpt == MAX_CODEPOINTS); - for (auto cpt : unicode_vec_whitespace) { - cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); + for (auto p : unicode_ranges_whitespace) { + for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) { + cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE); + } } for (auto p : unicode_map_lowercase) {