feat: first iteration NFC
This commit is contained in:
parent
14cd69a87d
commit
d5c3525bff
4 changed files with 91 additions and 0 deletions
|
@ -1649,3 +1649,16 @@ const std::map<char32_t, char32_t> unicode_map_lowercase = {
|
|||
{0x1E917, 0x1E939}, {0x1E918, 0x1E93A}, {0x1E919, 0x1E93B}, {0x1E91A, 0x1E93C}, {0x1E91B, 0x1E93D}, {0x1E91C, 0x1E93E},
|
||||
{0x1E91D, 0x1E93F}, {0x1E91E, 0x1E940}, {0x1E91F, 0x1E941}, {0x1E920, 0x1E942}, {0x1E921, 0x1E943},
|
||||
};
|
||||
|
||||
|
||||
// Canonical decomposition table: precomposed code point -> the sequence it decomposes into.
// Only two sample entries are present in this first iteration.
// decompose_cpts() expands entries recursively, so a key must never appear inside its own
// decomposition (the previous sample keyed the mapping on 'A' itself, which cannot terminate).
const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map = {
    {0x00C1, {0x0041, 0x0301}}, // Example: U+00C1 (A with acute) decomposes into A + combining acute accent
    {0x00E7, {0x0063, 0x0327}}, // Example: U+00E7 (c with cedilla) decomposes into c + combining cedilla
};
|
||||
|
||||
// Canonical_Combining_Class (ccc) per code point, used for canonical reordering.
// Class 0 marks a starter; a non-zero class marks a combining mark that may be reordered.
// Only the code points needed by the sample decompositions are listed in this first iteration.
const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class = {
    {0x0041, 0},   // Example: U+0041 (A) is a starter
    {0x0063, 0},   // Example: U+0063 (c) is a starter
    {0x0301, 230}, // Example: U+0301 combining acute accent has class 230 (above)
    {0x0327, 202}, // Example: U+0327 combining cedilla has class 202 (attached below)
};
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include <map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
|
||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
||||
|
@ -14,3 +15,5 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
|
|||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
||||
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
||||
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
||||
extern const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map;
|
||||
extern const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class;
|
||||
|
|
72
unicode.cpp
72
unicode.cpp
|
@ -13,6 +13,8 @@
|
|||
#include <vector>
|
||||
#include <locale>
|
||||
#include <codecvt>
|
||||
#include <unicode/unistr.h>
|
||||
#include <unicode/unorm2.h>
|
||||
|
||||
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
std::string result;
|
||||
|
@ -469,6 +471,68 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
|
|||
throw std::invalid_argument("invalid codepoint");
|
||||
}
|
||||
|
||||
// Function to recursively decompose a string
|
||||
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts) {
|
||||
std::vector<uint32_t> result;
|
||||
for (const auto& cpt : cpts) {
|
||||
auto it = unicode_decompose_map.find(cpt);
|
||||
if (it != unicode_decompose_map.end()) {
|
||||
for (const auto& decomp: it->second) {
|
||||
const auto & inner_result = decompose_cpts({decomp});
|
||||
result.insert(result.end(), inner_result.begin(), inner_result.end());
|
||||
}
|
||||
} else {
|
||||
result.push_back(cpt);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Function to sort subsequences based on canonical class
|
||||
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts) {
|
||||
std::vector<uint32_t> subsequence;
|
||||
std::vector<uint32_t> result;
|
||||
auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
|
||||
auto cc_a_it = unicode_canonical_class.find(a);
|
||||
if (cc_a_it != unicode_canonical_class.end()) {
|
||||
auto cc_b_it = unicode_canonical_class.find(b);
|
||||
if (cc_b_it != unicode_canonical_class.end()) {
|
||||
return cc_a_it->second < cc_b_it->second;
|
||||
}
|
||||
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
for (const auto& cpt : cpts) {
|
||||
auto it = unicode_canonical_class.find(cpt);
|
||||
if (it != unicode_canonical_class.end()) {
|
||||
if (it->second > 0) {
|
||||
subsequence.push_back(cpt);
|
||||
} else {
|
||||
if (!subsequence.empty()) {
|
||||
sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
|
||||
for (const auto& codepoint : subsequence) {
|
||||
result.push_back(codepoint);
|
||||
}
|
||||
subsequence.clear();
|
||||
}
|
||||
|
||||
result.push_back(cpt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!subsequence.empty()) {
|
||||
sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
|
||||
for (const auto& codepoint : subsequence) {
|
||||
result.push_back(codepoint);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
||||
std::vector<uint32_t> result;
|
||||
result.reserve(cpts.size());
|
||||
|
@ -483,6 +547,14 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts) {
|
||||
const auto &decomposed_cpts = decompose_cpts(cpts);
|
||||
const auto &sorted_sequence = sort_by_canonical_class(decomposed_cpts);
|
||||
//TODO: Do canonical composition
|
||||
return sorted_sequence;
|
||||
}
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||
std::vector<uint32_t> result;
|
||||
size_t offset = 0;
|
||||
|
|
|
@ -17,6 +17,9 @@ std::string unicode_cpt_to_utf8(uint32_t cp);
|
|||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts);
|
||||
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts);
|
||||
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts);
|
||||
|
||||
int unicode_cpt_type(uint32_t cp);
|
||||
int unicode_cpt_type(const std::string & utf8);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue