Unicode normalization NFD
This commit is contained in:
parent
707a08d06d
commit
641944a3a3
4 changed files with 1858 additions and 3 deletions
|
@ -1,5 +1,6 @@
|
||||||
import regex
|
import regex
|
||||||
import ctypes
|
import ctypes
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
class CoodepointFlags (ctypes.Structure):
|
class CoodepointFlags (ctypes.Structure):
|
||||||
|
@ -32,6 +33,7 @@ codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
|
||||||
table_whitespace = []
|
table_whitespace = []
|
||||||
table_lowercase = []
|
table_lowercase = []
|
||||||
table_uppercase = []
|
table_uppercase = []
|
||||||
|
table_nfd = []
|
||||||
|
|
||||||
for codepoint in range(MAX_CODEPOINTS):
|
for codepoint in range(MAX_CODEPOINTS):
|
||||||
# convert codepoint to unicode character
|
# convert codepoint to unicode character
|
||||||
|
@ -63,14 +65,30 @@ for codepoint in range(MAX_CODEPOINTS):
|
||||||
if codepoint != upper:
|
if codepoint != upper:
|
||||||
table_uppercase.append((codepoint, upper))
|
table_uppercase.append((codepoint, upper))
|
||||||
|
|
||||||
|
# NFD normalization
|
||||||
|
norm = ord(unicodedata.normalize('NFD', char)[0])
|
||||||
|
if codepoint != norm:
|
||||||
|
table_nfd.append((codepoint, norm))
|
||||||
|
|
||||||
ranges_flags = [(0, codepoint_flags[0])]
|
|
||||||
|
# group ranges with same flags
|
||||||
|
ranges_flags = [(0, codepoint_flags[0])] # start, flags
|
||||||
for codepoint, flags in enumerate(codepoint_flags):
|
for codepoint, flags in enumerate(codepoint_flags):
|
||||||
if bytes(flags) != bytes(ranges_flags[-1][1]):
|
if bytes(flags) != bytes(ranges_flags[-1][1]):
|
||||||
ranges_flags.append((codepoint, flags))
|
ranges_flags.append((codepoint, flags))
|
||||||
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
|
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
|
||||||
|
|
||||||
|
|
||||||
|
# group contiguous codepoints that share the same NFD mapping
#
# Each entry is (start, last, nfd): every codepoint in [start, last] maps to
# `nfd`. The list is seeded with (0, 0, 0) so it is never empty and its first
# range begins at codepoint 0.
ranges_nfd = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start, last, last_norm = ranges_nfd[-1]
    # table_nfd only lists codepoints whose NFD differs from the codepoint
    # itself, so two consecutive entries can be far apart. Extend the current
    # range only when this codepoint is directly adjacent AND has the same
    # target; merging on `norm` alone would make the range cover the unlisted
    # codepoints in between and remap them incorrectly.
    if norm == last_norm and codepoint == last + 1:
        ranges_nfd[-1] = (start, codepoint, norm)
    else:
        ranges_nfd.append((codepoint, codepoint, norm))
|
||||||
|
|
||||||
|
|
||||||
# Generate 'unicode-data.cpp'
|
# Generate 'unicode-data.cpp'
|
||||||
|
|
||||||
print("""\
|
print("""\
|
||||||
|
@ -103,3 +121,8 @@ print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
|
||||||
for tuple in table_uppercase:
|
for tuple in table_uppercase:
|
||||||
print("{0x%06X, 0x%06X}," % tuple)
|
print("{0x%06X, 0x%06X}," % tuple)
|
||||||
print("};\n")
|
print("};\n")
|
||||||
|
|
||||||
|
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
|
||||||
|
for triple in ranges_nfd:
|
||||||
|
print("{0x%06X, 0x%06X, 0x%06X}," % triple)
|
||||||
|
print("};\n")
|
||||||
|
|
1818
unicode-data.cpp
1818
unicode-data.cpp
File diff suppressed because it is too large
Load diff
|
@ -11,3 +11,4 @@ extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
|
||||||
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
||||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
|
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
|
||||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
|
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
|
||||||
|
extern const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd;
|
||||||
|
|
17
unicode.cpp
17
unicode.cpp
|
@ -134,6 +134,10 @@ static std::array<codepoint_flags, MAX_CODEPOINTS> unicode_cpt_flags_array() {
|
||||||
cpt_flags[p.second].is_uppercase = true;
|
cpt_flags[p.second].is_uppercase = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (auto &range : unicode_ranges_nfd) { // start, last, nfd
|
||||||
|
cpt_flags[std::get<2>(range)].is_nfd = true;
|
||||||
|
}
|
||||||
|
|
||||||
return cpt_flags;
|
return cpt_flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -576,8 +580,17 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
||||||
(void) cpts;
|
// unicode_ranges_nfd[i] -> tuple(first, last, nfd)
|
||||||
return {}; //####WIP
|
auto comp = +[] (const uint32_t cpt, const decltype(unicode_ranges_nfd)::value_type & triple) {
|
||||||
|
return cpt < std::get<0>(triple);
|
||||||
|
};
|
||||||
|
std::vector<uint32_t> result(cpts.size());
|
||||||
|
for (size_t i = 0; i < cpts.size(); ++i) {
|
||||||
|
const uint32_t cpt = cpts[i];
|
||||||
|
auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
|
||||||
|
result[i] = (std::get<0>(*it) <= cpt && cpt <= std::get<1>(*it)) ? std::get<2>(*it) : cpt;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue