feat: remove extra complexity in NFD

Joan Martinez 2024-05-08 10:12:10 +02:00
parent 043f298775
commit 668e0d9b73
6 changed files with 15 additions and 51 deletions


@@ -16,10 +16,17 @@ Feature: llama.cpp server
     Then the server is starting
     Then the server is healthy
 
+  Scenario: Embedding
+    When embeddings are computed for:
+    """
+      What is the capital of Bulgaria ?
+    """
+    Then embeddings are generated
+
   Scenario: Tokenize / Detokenize complex
     When tokenizing:
     """
-      España is your's mine's l'heure èspciâl café über naïve résumé cañón élite cañas Barça ि ि ि ि ि, ि
+      España is your's mine's l'heure èspciâl café über naïve résumé cañón élite cañas Barça
     """
     Then tokens can be detokenize and is equivalent False


@@ -12456,8 +12456,7 @@ struct llm_tokenizer_wpm {
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
-        auto unicode_cpts = unicode_cpts_from_utf8(text);
-        std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts);
+        std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
 
         // strip accents, strip control, uniformize whitespace,
         // to lowercase, pad chinese characters, pad punctuation

File diff suppressed because one or more lines are too long


@@ -14,4 +14,3 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
 extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
 extern const std::map<char32_t, char32_t> unicode_map_lowercase;
-extern const std::map<uint32_t, uint32_t> unicode_canonical_class;


@@ -13,7 +13,6 @@
 #include <vector>
 #include <locale>
 #include <codecvt>
-#include <algorithm>
 
 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     std::string result;
@@ -470,54 +469,21 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
     throw std::invalid_argument("invalid codepoint");
 }
 
-auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
-    auto cc_a_it = unicode_canonical_class.find(a);
-    if (cc_a_it != unicode_canonical_class.end()) {
-        auto cc_b_it = unicode_canonical_class.find(b);
-        if (cc_b_it != unicode_canonical_class.end()) {
-            return cc_a_it->second < cc_b_it->second;
-        }
-    }
-    return false;
-};
-
-// Function to sort subsequences based on canonical class
-std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts) {
-    // Sort the sequence using the custom comparator function
-    sort(cpts.begin(), cpts.end(), compareByCanonicalClass);
-    return cpts;
-}
-
-std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, uint32_t starting_offset) {
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
     std::vector<uint32_t> result;
-    for (auto i = starting_offset; i < cpts.size(); i++) {
-        const auto& it = unicode_map_nfd.equal_range(cpts[i]);
+    for (uint32_t cpt : cpts) {
+        auto it = unicode_map_nfd.equal_range(cpt);
         if (it.first != it.second) {
-            uint offset = 0;
             for (auto jt = it.first; jt != it.second; jt++) {
-                if (offset == 0) {
-                    cpts[i] = jt->second;
-                } else {
-                    cpts.emplace(cpts.begin() + i + offset, jt->second);
-                }
-                offset++;
+                result.push_back(jt->second);
             }
-            const auto & inner_result = canonical_decomposition_cpts(cpts, i);
-            result.insert(result.end(), inner_result.begin(), inner_result.end());
-            break;
         } else {
-            result.push_back(cpts[i]);
+            result.push_back(cpt);
         }
     }
     return result;
 }
-
-std::vector<uint32_t> unicode_cpts_normalize_nfd(std::vector<uint32_t> & cpts) {
-    auto result = canonical_decomposition_cpts(cpts, 0);
-    return sort_by_canonical_class(result);
-}
 
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     std::vector<uint32_t> result;
     size_t offset = 0;
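
For reference, a standalone sketch (not part of this commit) of the simplified decomposition pass, with a toy table standing in for the generated unicode_map_nfd; the entries below are illustrative values, not the real data. It shows the behavior after the change: a code point with an entry is expanded to its mapped sequence in table order, everything else passes through unchanged, and there is no recursive re-decomposition.

#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

// Toy stand-in for unicode_map_nfd (the real table is generated and
// lives in the data file whose diff is suppressed above).
static const std::multimap<uint32_t, uint32_t> toy_map_nfd = {
    {0x00E9, 0x0065}, {0x00E9, 0x0301}, // é -> e + U+0301 combining acute
    {0x00F1, 0x006E}, {0x00F1, 0x0303}, // ñ -> n + U+0303 combining tilde
};

static std::vector<uint32_t> normalize_nfd_sketch(const std::vector<uint32_t> & cpts) {
    std::vector<uint32_t> result;
    for (uint32_t cpt : cpts) {
        auto range = toy_map_nfd.equal_range(cpt);
        if (range.first != range.second) {
            // expand to the decomposition sequence, in table order
            for (auto it = range.first; it != range.second; ++it) {
                result.push_back(it->second);
            }
        } else {
            // no decomposition entry: keep the code point as-is
            result.push_back(cpt);
        }
    }
    return result;
}

int main() {
    // "café" as code points; prints U+0063 U+0061 U+0066 U+0065 U+0301
    for (uint32_t cpt : normalize_nfd_sketch({0x0063, 0x0061, 0x0066, 0x00E9})) {
        std::printf("U+%04X ", (unsigned) cpt);
    }
    std::printf("\n");
}

Because the single pass keeps combining marks in table order and never re-sorts them by canonical combining class, input with several combining marks on one base character is not guaranteed to come out in fully canonical NFD order; that reordering step is the complexity this commit removes.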


@@ -16,9 +16,7 @@
 std::string unicode_cpt_to_utf8(uint32_t cp);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
-std::vector<uint32_t> unicode_cpts_normalize_nfd(std::vector<uint32_t> & cpts);
-std::vector<uint32_t> canonical_decomposition_cpts(std::vector<uint32_t> & cpts, uint32_t starting_offset);
-std::vector<uint32_t> sort_by_canonical_class(std::vector<uint32_t> & cpts);
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
 
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);
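
As a usage note (also not part of the commit), a minimal sketch of calling the updated API end to end, assuming the header with the declarations above is included; since the vector-to-UTF-8 helper is internal to the implementation file, the output string is reassembled one code point at a time with unicode_cpt_to_utf8:

#include <cstdint>
#include <string>
#include <vector>
// assumes the unicode header declaring the functions above is included

static std::string normalize_utf8_nfd(const std::string & text) {
    // UTF-8 -> code points -> single-pass decomposition -> UTF-8
    std::string out;
    for (uint32_t cpt : unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text))) {
        out += unicode_cpt_to_utf8(cpt);
    }
    return out;
}

With the previous non-const signature this chained call would not compile, since a temporary cannot bind to a non-const reference, which is presumably why the declaration changed alongside the simplification.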