// Additions for a first step towards NFC normalization: canonical
// decomposition followed by canonical reordering. Canonical composition is
// not implemented yet.
//
// This text was originally a patch touching four files; each section below
// names the file it belongs to. Context lines of the patch that belong to
// definitions not fully visible here are not reproduced.
//
// NOTE(review): the template arguments were missing from the collapsed patch
// text and are reconstructed from how the tables are used. In particular the
// mapped type of unicode_canonical_class is a guess - confirm.

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

// ---------------------------------------------------------------------------
// unicode-data.h (appended after the existing extern declarations)
// ---------------------------------------------------------------------------

// Code point -> its canonical decomposition (a sequence of code points).
extern const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map;
// Code point -> its Canonical_Combining_Class. Code points that are absent
// are treated as class 0 (starter) by sort_by_canonical_class.
extern const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class;

// ---------------------------------------------------------------------------
// unicode-data.cpp (appended after unicode_map_lowercase)
// ---------------------------------------------------------------------------

// Example entries only; the full table still has to be generated.
// An entry must never list its own key among its components: a plain 'A'
// (U+0041) has no decomposition, the precomposed letter is U+00C1.
const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map = {
    {0x00C1, {0x0041, 0x0301}}, // U+00C1 (A with acute)   -> U+0041 'A' + U+0301 combining acute accent
    {0x00E7, {0x0063, 0x0327}}, // U+00E7 (c with cedilla) -> U+0063 'c' + U+0327 combining cedilla
};

// Example entries only; values are the Canonical_Combining_Class of each
// code point as listed in the Unicode Character Database.
const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class = {
    {0x0041, 0},   // 'A' is a starter
    {0x0301, 230}, // combining acute accent (above)
    {0x0063, 0},   // 'c' is a starter
    {0x0327, 202}, // combining cedilla (attached below)
};

// ---------------------------------------------------------------------------
// unicode.h (added next to unicode_cpts_normalize_nfd)
// ---------------------------------------------------------------------------

std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts);
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts);
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts);

// ---------------------------------------------------------------------------
// unicode.cpp (added around unicode_cpts_normalize_nfd)
// ---------------------------------------------------------------------------

// Canonical combining class of cpt, or 0 when the table has no entry for it.
static uint32_t canonical_class_of(uint32_t cpt) {
    const auto it = unicode_canonical_class.find(cpt);
    return it == unicode_canonical_class.end() ? 0 : it->second;
}

// Recursively replaces every code point that has an entry in
// unicode_decompose_map by its components; code points without an entry are
// copied through unchanged. Returns the fully expanded sequence.
std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts) {
    std::vector<uint32_t> result;
    result.reserve(cpts.size());
    for (const uint32_t cpt : cpts) {
        const auto it = unicode_decompose_map.find(cpt);
        if (it == unicode_decompose_map.end()) {
            result.push_back(cpt);
            continue;
        }
        for (const uint32_t part : it->second) {
            if (part == cpt) {
                // A table entry that contains its own key would recurse
                // forever; emit the component as-is instead.
                result.push_back(part);
                continue;
            }
            const std::vector<uint32_t> expanded = decompose_cpts({part});
            result.insert(result.end(), expanded.begin(), expanded.end());
        }
    }
    return result;
}

// Canonical reordering: within every maximal run of code points whose
// combining class is non-zero, orders the run by ascending class. Starters
// (class 0, including code points missing from the table) keep their
// position and delimit the runs. The output always has the same length as
// the input.
std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts) {
    const auto is_starter = [](uint32_t cpt) { return canonical_class_of(cpt) == 0; };
    const auto by_class   = [](uint32_t a, uint32_t b) {
        return canonical_class_of(a) < canonical_class_of(b);
    };

    std::vector<uint32_t> result(cpts);
    auto run_begin = result.begin();
    while (run_begin != result.end()) {
        run_begin = std::find_if_not(run_begin, result.end(), is_starter);
        const auto run_end = std::find_if(run_begin, result.end(), is_starter);
        // Stable: marks that share a class must keep their relative order.
        std::stable_sort(run_begin, run_end, by_class);
        run_begin = run_end;
    }
    return result;
}

// Intended to produce NFC. Currently performs only canonical decomposition
// and canonical reordering, so the result is the decomposed form.
// TODO: Do canonical composition
std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts) {
    return sort_by_canonical_class(decompose_cpts(cpts));
}