feat: first iteration NFC

This commit is contained in:
Joan Martinez 2024-05-06 16:04:01 +02:00
parent 14cd69a87d
commit d5c3525bff
4 changed files with 91 additions and 0 deletions

View file

@ -1649,3 +1649,16 @@ const std::map<char32_t, char32_t> unicode_map_lowercase = {
{0x1E917, 0x1E939}, {0x1E918, 0x1E93A}, {0x1E919, 0x1E93B}, {0x1E91A, 0x1E93C}, {0x1E91B, 0x1E93D}, {0x1E91C, 0x1E93E},
{0x1E91D, 0x1E93F}, {0x1E91E, 0x1E940}, {0x1E91F, 0x1E941}, {0x1E920, 0x1E942}, {0x1E921, 0x1E943},
};
// Canonical decomposition table: code point -> sequence of code points it decomposes into.
// Only a small sample of entries is present so far.
// A key must never appear in its own decomposition: decompose_cpts() expands entries
// recursively, so a self-referential entry would never terminate.
const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map = {
    {193, {65, 769}}, // U+00C1 (A with acute) decomposes into U+0041 'A' + U+0301 combining acute accent
    {231, {99, 807}}, // U+00E7 (c with cedilla) decomposes into U+0063 'c' + U+0327 combining cedilla
};
// Canonical combining class (CCC) table: code point -> combining class.
// Class 0 marks a starter; non-zero classes are combining marks that get reordered
// by ascending class. Only a small sample of entries is present so far.
const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class = {
    {65,  0},   // U+0041 'A': starter
    {769, 230}, // U+0301 combining acute accent: class 230 (above)
    {99,  0},   // U+0063 'c': starter
    {807, 202}, // U+0327 combining cedilla: class 202 (attached below)
};

View file

@ -4,6 +4,7 @@
#include <map>
#include <utility>
#include <vector>
#include <unordered_map>
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
@ -14,3 +15,5 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
// Code point -> sequence of code points it decomposes into (looked up by decompose_cpts).
extern const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map;
// Code point -> canonical combining class (looked up by sort_by_canonical_class).
extern const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class;

View file

@ -13,6 +13,8 @@
#include <algorithm>
#include <vector>
#include <locale>
#include <codecvt>
#include <unicode/unistr.h>
#include <unicode/unorm2.h>
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
@ -469,6 +471,68 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
throw std::invalid_argument("invalid codepoint");
}
// Function to recursively decompose a string
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts) {
std::vector<uint32_t> result;
for (const auto& cpt : cpts) {
auto it = unicode_decompose_map.find(cpt);
if (it != unicode_decompose_map.end()) {
for (const auto& decomp: it->second) {
const auto & inner_result = decompose_cpts({decomp});
result.insert(result.end(), inner_result.begin(), inner_result.end());
}
} else {
result.push_back(cpt);
}
}
return result;
}
// Puts combining marks into canonical order.
//
// The input is scanned for maximal runs of code points whose canonical combining
// class (looked up in unicode_canonical_class) is non-zero. Each run is sorted by
// ascending class; code points of class 0 act as run boundaries and never move.
//
// Code points that are missing from unicode_canonical_class are treated as class 0,
// so they are kept in place rather than removed from the output.
//
// cpts:    code points to reorder (typically the output of decompose_cpts)
// returns: a sequence of the same length with every combining run canonically ordered
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts) {
    const auto canonical_class = [](uint32_t cpt) -> uint32_t {
        const auto it = unicode_canonical_class.find(cpt);
        return it == unicode_canonical_class.end() ? 0 : it->second;
    };
    const auto is_starter = [&](uint32_t cpt) { return canonical_class(cpt) == 0; };
    const auto is_mark    = [&](uint32_t cpt) { return canonical_class(cpt) != 0; };
    const auto by_class   = [&](uint32_t a, uint32_t b) { return canonical_class(a) < canonical_class(b); };

    std::vector<uint32_t> result(cpts);

    auto run_begin = result.begin();
    while (run_begin != result.end()) {
        // locate the next run of combining marks: [run_begin, run_end)
        run_begin = std::find_if(run_begin, result.end(), is_mark);
        const auto run_end = std::find_if(run_begin, result.end(), is_starter);

        // stable: marks sharing a class must keep their relative order
        std::stable_sort(run_begin, run_end, by_class);

        run_begin = run_end;
    }

    return result;
}
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
std::vector<uint32_t> result;
result.reserve(cpts.size());
@ -483,6 +547,14 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
return result;
}
std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts) {
const auto &decomposed_cpts = decompose_cpts(cpts);
const auto &sorted_sequence = sort_by_canonical_class(decomposed_cpts);
//TODO: Do canonical composition
return sorted_sequence;
}
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
size_t offset = 0;

View file

@ -17,6 +17,9 @@ std::string unicode_cpt_to_utf8(uint32_t cp);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
// Decomposes cpts and reorders the result by canonical class. The canonical composition
// step is still a TODO in the definition, so the output is not composed yet.
std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts);
// Recursively expands every code point that has an entry in unicode_decompose_map and
// returns the flattened sequence; code points without an entry are copied through.
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts);
// Reorders each run of code points whose canonical class (per unicode_canonical_class)
// is greater than 0 by ascending class; class-0 code points delimit the runs.
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts);
int unicode_cpt_type(uint32_t cp);
int unicode_cpt_type(const std::string & utf8);