Unicode normalization NFD
This commit is contained in:
parent
707a08d06d
commit
641944a3a3
4 changed files with 1858 additions and 3 deletions
|
@ -1,5 +1,6 @@
|
|||
import regex
|
||||
import ctypes
|
||||
import unicodedata
|
||||
|
||||
|
||||
class CoodepointFlags (ctypes.Structure):
|
||||
|
@ -32,6 +33,7 @@ codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
|
|||
table_whitespace = []
|
||||
table_lowercase = []
|
||||
table_uppercase = []
|
||||
table_nfd = []
|
||||
|
||||
for codepoint in range(MAX_CODEPOINTS):
|
||||
# convert codepoint to unicode character
|
||||
|
@ -63,14 +65,30 @@ for codepoint in range(MAX_CODEPOINTS):
|
|||
if codepoint != upper:
|
||||
table_uppercase.append((codepoint, upper))
|
||||
|
||||
# NFD normalization
|
||||
norm = ord(unicodedata.normalize('NFD', char)[0])
|
||||
if codepoint != norm:
|
||||
table_nfd.append((codepoint, norm))
|
||||
|
||||
ranges_flags = [(0, codepoint_flags[0])]
|
||||
|
||||
# group ranges with same flags
|
||||
ranges_flags = [(0, codepoint_flags[0])] # start, flags
|
||||
for codepoint, flags in enumerate(codepoint_flags):
|
||||
if bytes(flags) != bytes(ranges_flags[-1][1]):
|
||||
ranges_flags.append((codepoint, flags))
|
||||
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
|
||||
|
||||
|
||||
# group ranges with same nfd
#
# Collapse table_nfd (pairs of codepoint -> first codepoint of its NFD form)
# into inclusive ranges (start, last, nfd), where every codepoint in
# [start, last] maps to the same nfd value.
#
# The list is seeded with (0, 0, 0) so that a range starting at codepoint 0
# always exists as the first entry.
ranges_nfd = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start = ranges_nfd[-1][0]
    # Extend the current range only when this codepoint immediately follows
    # its last codepoint AND maps to the same target. Comparing the target
    # alone would merge entries separated by a gap (codepoints that are not in
    # table_nfd), and the resulting range would wrongly cover the gap.
    if ranges_nfd[-1] != (start, codepoint - 1, norm):
        ranges_nfd.append(None)
        start = codepoint
    ranges_nfd[-1] = (start, codepoint, norm)
|
||||
|
||||
|
||||
# Generate 'unicode-data.cpp'
|
||||
|
||||
print("""\
|
||||
|
@ -103,3 +121,8 @@ print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
|
|||
for tuple in table_uppercase:
|
||||
print("{0x%06X, 0x%06X}," % tuple)
|
||||
print("};\n")
|
||||
|
||||
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
|
||||
for triple in ranges_nfd:
|
||||
print("{0x%06X, 0x%06X, 0x%06X}," % triple)
|
||||
print("};\n")
|
||||
|
|
1818
unicode-data.cpp
1818
unicode-data.cpp
File diff suppressed because it is too large
Load diff
|
@ -11,3 +11,4 @@ extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
|
|||
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
|
||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
|
||||
extern const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd;
|
||||
|
|
17
unicode.cpp
17
unicode.cpp
|
@ -134,6 +134,10 @@ static std::array<codepoint_flags, MAX_CODEPOINTS> unicode_cpt_flags_array() {
|
|||
cpt_flags[p.second].is_uppercase = true;
|
||||
}
|
||||
|
||||
for (auto &range : unicode_ranges_nfd) { // start, last, nfd
|
||||
cpt_flags[std::get<2>(range)].is_nfd = true;
|
||||
}
|
||||
|
||||
return cpt_flags;
|
||||
}
|
||||
|
||||
|
@ -576,8 +580,17 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
|
|||
}
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
||||
(void) cpts;
|
||||
return {}; //####WIP
|
||||
// unicode_ranges_nfd[i] -> tuple(first, last, nfd)
|
||||
auto comp = +[] (const uint32_t cpt, const decltype(unicode_ranges_nfd)::value_type & triple) {
|
||||
return cpt < std::get<0>(triple);
|
||||
};
|
||||
std::vector<uint32_t> result(cpts.size());
|
||||
for (size_t i = 0; i < cpts.size(); ++i) {
|
||||
const uint32_t cpt = cpts[i];
|
||||
auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
|
||||
result[i] = (std::get<0>(*it) <= cpt && cpt <= std::get<1>(*it)) ? std::get<2>(*it) : cpt;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue