Unicode normalization NFD

This commit is contained in:
jaime-m-p 2024-05-13 01:18:51 +02:00
parent 707a08d06d
commit 641944a3a3
4 changed files with 1858 additions and 3 deletions

View file

@ -1,5 +1,6 @@
import regex
import ctypes
import unicodedata
class CoodepointFlags (ctypes.Structure):
@ -32,6 +33,7 @@ codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
# lookup tables emitted into the generated C++ source
# (presumably all filled by the per-codepoint loop below; only the uppercase and nfd appends are visible here)
table_whitespace = []
table_lowercase = []
table_uppercase = []  # (codepoint, upper) pairs, only where upper != codepoint
table_nfd = []  # (codepoint, nfd) pairs, only where the first NFD codepoint != codepoint
for codepoint in range(MAX_CODEPOINTS):
# convert codepoint to unicode character
@ -63,14 +65,30 @@ for codepoint in range(MAX_CODEPOINTS):
if codepoint != upper:
table_uppercase.append((codepoint, upper))
# NFD normalization
norm = ord(unicodedata.normalize('NFD', char)[0])
if codepoint != norm:
table_nfd.append((codepoint, norm))
# group ranges with same flags
# each entry records the first codepoint at which a new flag value begins
ranges_flags = [(0, codepoint_flags[0])]  # start, flags
for codepoint, flags in enumerate(codepoint_flags):
    # ctypes structures are compared through their raw bytes
    if bytes(flags) != bytes(ranges_flags[-1][1]):
        ranges_flags.append((codepoint, flags))
# closing entry at MAX_CODEPOINTS with default-constructed flags
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
# group runs of *adjacent* codepoints that share the same nfd
# A range may only grow when the next codepoint directly follows the previous
# one; otherwise codepoints lying between two entries (which have no NFD
# mapping of their own) would be swallowed by the range and wrongly remapped.
ranges_nfd = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start, last, last_norm = ranges_nfd[-1]
    if norm == last_norm and codepoint == last + 1:
        # extend the current range by one codepoint
        ranges_nfd[-1] = (start, codepoint, norm)
    else:
        # different nfd, or a gap in the codepoints: open a new range
        ranges_nfd.append((codepoint, codepoint, norm))
# Generate 'unicode-data.cpp'
print("""\
@ -103,3 +121,8 @@ print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
# emit one "{codepoint, upper}," initializer per entry, then close the table
# (loop variable renamed so the builtin `tuple` is no longer shadowed)
for pair in table_uppercase:
    print("{0x%06X, 0x%06X}," % pair)
print("};\n")
# emit the NFD range table: one "{start, last, nfd}," initializer per range
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
for start, last, nfd in ranges_nfd:
    print("{0x%06X, 0x%06X, 0x%06X}," % (start, last, nfd))
print("};\n")

File diff suppressed because it is too large Load diff

View file

@ -11,3 +11,4 @@ extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
// NFD lookup table: each tuple is (start, last, nfd), i.e. the inclusive
// codepoint range [start, last] maps to the single codepoint nfd.
extern const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd;

View file

@ -134,6 +134,10 @@ static std::array<codepoint_flags, MAX_CODEPOINTS> unicode_cpt_flags_array() {
cpt_flags[p.second].is_uppercase = true;
}
for (auto &range : unicode_ranges_nfd) { // start, last, nfd
cpt_flags[std::get<2>(range)].is_nfd = true;
}
return cpt_flags;
}
@ -576,8 +580,17 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
}
// Maps every codepoint in `cpts` through the unicode_ranges_nfd table.
//
// Each table entry is tuple(first, last, nfd): codepoints in the inclusive
// range [first, last] are replaced by `nfd`; codepoints covered by no range
// are copied unchanged. The result has the same length as the input.
//
// The binary search relies on the table being sorted by range start.
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
    // unicode_ranges_nfd[i] -> tuple(first, last, nfd)
    auto comp = [] (const uint32_t cpt, const decltype(unicode_ranges_nfd)::value_type & triple) {
        return cpt < std::get<0>(triple);
    };
    std::vector<uint32_t> result(cpts.size());
    for (size_t i = 0; i < cpts.size(); ++i) {
        const uint32_t cpt = cpts[i];
        // first range starting strictly after cpt; the candidate range is the one before it
        const auto next = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp);
        result[i] = cpt;
        // guard: with no preceding range (empty table, or cpt below the first start)
        // stepping back from begin() would be undefined behaviour
        if (next != unicode_ranges_nfd.cbegin()) {
            const auto & range = *(next - 1);
            if (std::get<0>(range) <= cpt && cpt <= std::get<1>(range)) {
                result[i] = std::get<2>(range);
            }
        }
    }
    return result;
}
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {