Unicode normalization NFD

2024-05-13 01:18:51 +02:00 · 2024-05-13 01:18:51 +02:00 · 641944a3a3
commit 641944a3a3
parent 707a08d06d
4 changed files with 1858 additions and 3 deletions
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@ -1,5 +1,6 @@
 import regex
 import ctypes
+import unicodedata


 class CoodepointFlags (ctypes.Structure):
@ -32,6 +33,7 @@ codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
 table_whitespace = []
 table_lowercase = []
 table_uppercase = []
+table_nfd = []

 for codepoint in range(MAX_CODEPOINTS):
    # convert codepoint to unicode character
@ -63,14 +65,30 @@ for codepoint in range(MAX_CODEPOINTS):
    if codepoint != upper:
        table_uppercase.append((codepoint, upper))

+    # NFD normalization: keep only the first codepoint of the decomposed form, the rest is discarded
+    norm = ord(unicodedata.normalize('NFD', char)[0])
+    if codepoint != norm:
+        table_nfd.append((codepoint, norm))  # only codepoints whose first NFD codepoint differs are recorded

-ranges_flags = [(0, codepoint_flags[0])]
+
+# group consecutive codepoints with identical flags; each entry marks where a new run of flags begins
+ranges_flags = [(0, codepoint_flags[0])] # start, flags
 for codepoint, flags in enumerate(codepoint_flags):
    if bytes(flags) != bytes(ranges_flags[-1][1]):
        ranges_flags.append((codepoint, flags))
 ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))


+# group runs of adjacent codepoints that share the same nfd; a gap in the codepoints starts a new range
+ranges_nfd = [(0, 0, 0)] # start, last, nfd (first entry is a sentinel: codepoint 0 maps to itself)
+for codepoint, norm in table_nfd:
+    start, last, last_norm = ranges_nfd[-1]
+    if norm != last_norm or codepoint != last + 1:  # skipped codepoints are not in table_nfd, keep them out of the range
+        ranges_nfd.append(None)  # placeholder, overwritten right below
+        start = codepoint
+    ranges_nfd[-1] = (start, codepoint, norm)
+
+
 # Generate 'unicode-data.cpp'

 print("""\
@ -103,3 +121,8 @@ print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
 for tuple in table_uppercase:
    print("{0x%06X, 0x%06X}," % tuple)
 print("};\n")
+
+print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = {  // start, last, nfd")
+for start, last, nfd in ranges_nfd:  # one initializer row per range
+    print("{0x%06X, 0x%06X, 0x%06X}," % (start, last, nfd))
+print("};\n")
--- a/unicode-data.cpp
+++ b/unicode-data.cpp
--- a/unicode-data.h
+++ b/unicode-data.h
@ -11,3 +11,4 @@ extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
+extern const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd;  // start, last (inclusive), nfd
--- a/unicode.cpp
+++ b/unicode.cpp
@ -134,6 +134,10 @@ static std::array<codepoint_flags, MAX_CODEPOINTS> unicode_cpt_flags_array() {
        cpt_flags[p.second].is_uppercase = true;
    }

+    for (auto &range : unicode_ranges_nfd) {  // start, last, nfd
+        cpt_flags[std::get<2>(range)].is_nfd = true;
+    }
+
    return cpt_flags;
 }

@ -576,8 +580,17 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
 }

 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
-    (void) cpts;
-    return {};  //####WIP
+    // Returns cpts with every codepoint inside a unicode_ranges_nfd entry (first, last, nfd) replaced by that entry's nfd.
+    auto starts_after = [] (const uint32_t cpt, const decltype(unicode_ranges_nfd)::value_type & range) {
+        return cpt < std::get<0>(range);  // binary search relies on the entries being ordered by 'first'
+    };
+    std::vector<uint32_t> result(cpts);  // codepoints outside every range keep their copied value
+    for (uint32_t & cpt : result) {
+        const auto next = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, starts_after);
+        const bool found = next != unicode_ranges_nfd.cbegin() && std::get<0>(*(next - 1)) <= cpt && cpt <= std::get<1>(*(next - 1));
+        cpt = found ? std::get<2>(*(next - 1)) : cpt;  // 'next - 1' is only dereferenced when a preceding range exists
+    }
+    return result;
 }

 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {