From eb5a0e162ae84604b6437750ae056077eaa88bf9 Mon Sep 17 00:00:00 2001 From: Joan Martinez Date: Tue, 7 May 2024 16:56:34 +0200 Subject: [PATCH] fix: do not insert in the middle of iteration --- unicode.cpp | 38 +++++++++++++++++++------------------- unicode.h | 4 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/unicode.cpp b/unicode.cpp index 6288e1151..19587b3b0 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -470,47 +470,47 @@ std::string unicode_cpt_to_utf8(uint32_t cp) { throw std::invalid_argument("invalid codepoint"); } +auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) { + auto cc_a_it = unicode_canonical_class.find(a); + if (cc_a_it != unicode_canonical_class.end()) { + auto cc_b_it = unicode_canonical_class.find(b); + if (cc_b_it != unicode_canonical_class.end()) { + return cc_a_it->second < cc_b_it->second; + } + + } + return false; +}; + // Function to sort subsequences based on canonical class std::vector sort_by_canonical_class(std::vector & cpts) { - auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) { - auto cc_a_it = unicode_canonical_class.find(a); - if (cc_a_it != unicode_canonical_class.end()) { - auto cc_b_it = unicode_canonical_class.find(b); - if (cc_b_it != unicode_canonical_class.end()) { - return cc_a_it->second < cc_b_it->second; - } - - } - return false; - }; - // Sort the sequence using the custom comparator function sort(cpts.begin(), cpts.end(), compareByCanonicalClass); return cpts; } -std::vector canonical_decomposition_cpts(std::vector & cpts, const std::vector::iterator& cpt_begin, const std::vector::iterator& cpt_end) { +std::vector canonical_decomposition_cpts(std::vector & cpts, uint32_t starting_offset) { std::vector result; - for (auto cpt_it = cpt_begin; cpt_it != cpt_end; ++cpt_it) { - auto it = unicode_map_nfd.equal_range(*cpt_it); + for (auto i = starting_offset; i < cpts.size(); i++) { + auto it = unicode_map_nfd.equal_range(cpts[i]); if (it.first != it.second) { uint offset = 0; for (auto jt = it.first; jt != it.second; jt++) { - cpts.insert(cpt_it + offset, jt->second); + cpts.emplace(cpts.begin() + i + offset, jt->second); offset++; } - const auto & inner_result = canonical_decomposition_cpts(cpts, cpt_it, cpt_end); + const auto & inner_result = canonical_decomposition_cpts(cpts, i); result.insert(result.end(), inner_result.begin(), inner_result.end()); break; } else { - result.push_back(*cpt_it); + result.push_back(cpts[i]); } } return result; } std::vector unicode_cpts_normalize_nfd(std::vector & cpts) { - auto result = canonical_decomposition_cpts(cpts, cpts.begin(), cpts.end()); + auto result = canonical_decomposition_cpts(cpts, 0); return sort_by_canonical_class(result); } diff --git a/unicode.h b/unicode.h index d31a2115f..89f170144 100644 --- a/unicode.h +++ b/unicode.h @@ -17,8 +17,8 @@ std::string unicode_cpt_to_utf8(uint32_t cp); std::vector unicode_cpts_from_utf8(const std::string & utf8); std::vector unicode_cpts_normalize_nfd(std::vector & cpts); -std::vector canonical_decomposition_cpts(std::vector & cpts, const std::vector::iterator& cpt_begin, const std::vector::iterator& cpt_end); -std::vector sort_by_canonical_class(const std::vector & cpts); +std::vector canonical_decomposition_cpts(std::vector & cpts, uint32_t starting_offset); +std::vector sort_by_canonical_class(std::vector & cpts); int unicode_cpt_type(uint32_t cp); int unicode_cpt_type(const std::string & utf8);