From 22a011329963834e1a948b53143cf18e5d8aca57 Mon Sep 17 00:00:00 2001
From: Joan Martinez <joan.fontanals.martinez@jina.ai>
Date: Mon, 13 May 2024 10:27:23 +0200
Subject: [PATCH] fix: fix alignment

---
 llama.cpp      | 18 ++++++-------
 llama.h        |  1 -
 unicode-data.h |  5 +---
 unicode.cpp    | 72 --------------------------------------------------
 unicode.h      |  3 ---
 5 files changed, 10 insertions(+), 89 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 92d1eeeca..eff22bb96 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8241,9 +8241,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
 
-        // positions of the tokens in the KV cache
-        struct ggml_tensor * KQ_pos = build_inp_KQ_pos(false);
-
         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * cur = inpL;
@@ -8386,7 +8383,6 @@ struct llm_build_context {
             // output layer norm
             cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
 
-
             // input for next layer
             inpL = cur;
         }
@@ -11506,7 +11502,7 @@ static int llama_decode_internal(
         }
 
         // non-causal masks do not use the KV cache
-        if (hparams.causal_attn || model.arch == LLM_ARCH_JINA_BERT_V2) {
+        if (hparams.causal_attn) {
             llama_kv_cache_update(&lctx);
 
             // if we have enough unused cells before the current head ->
@@ -12350,10 +12346,14 @@ struct llm_tokenizer_bpe {
                         break;
                     case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                         //TODO: Apply GPT2 + lowercasing
-                        word_collection = unicode_regex_split(text, {
-                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                        });
-                        //TODO: Apply lowercasing
+                        {
+                            // std::tolower works per byte: only single-byte (ASCII) letters are reliably lowercased (NOTE(review): unicode_map_lowercase may be the better fit for UTF-8 input - confirm)
+                            std::string lowercase_text = text;
+                            std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return static_cast<char>(std::tolower(c)); });
+                            // split the lowercased text with the GPT2 pattern, as the TODO above asks; an empty pattern provides no split rule
+                            word_collection = unicode_regex_split(lowercase_text, { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)" });
+                        }
+                        break;
                     default:
                         // default regex for BPE tokenization pre-processing
                         word_collection = unicode_regex_split(text, {
diff --git a/llama.h b/llama.h
index d24e3cd96..5c9fc9a2f 100644
--- a/llama.h
+++ b/llama.h
@@ -71,7 +71,6 @@ extern "C" {
 
     // pre-tokenization types
     enum llama_vocab_pre_type {
-
         LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
         LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
diff --git a/unicode-data.h b/unicode-data.h
index fc2ea944f..a9c2fd258 100644
--- a/unicode-data.h
+++ b/unicode-data.h
@@ -4,7 +4,6 @@
 #include <map>
 #include <utility>
 #include <vector>
-#include <unordered_map>
 
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
@@ -15,6 +14,4 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuati
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
 extern const std::multimap<uint32_t, uint32_t>          unicode_map_nfd;
-extern const std::map<char32_t, char32_t>               unicode_map_lowercase;
-extern const std::unordered_map<uint32_t, std::vector<uint32_t>> unicode_decompose_map;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_canonical_class;
+extern const std::map<char32_t, char32_t>               unicode_map_lowercase;
\ No newline at end of file
diff --git a/unicode.cpp b/unicode.cpp
index c2b9ba9c6..ca03c49d3 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -14,8 +14,6 @@
 #include <vector>
 #include <locale>
 #include <codecvt>
-#include <unicode/unistr.h>
-#include <unicode/unorm2.h>
 
 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     std::string result;
@@ -590,68 +588,6 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
     throw std::invalid_argument("invalid codepoint");
 }
 
-// Function to recursively decompose a string
-std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts) {
-    std::vector<uint32_t> result;
-    for (const auto& cpt : cpts) {
-        auto it = unicode_decompose_map.find(cpt);
-        if (it != unicode_decompose_map.end()) {
-            for (const auto& decomp: it->second) {
-                const auto & inner_result = decompose_cpts({decomp});
-                result.insert(result.end(), inner_result.begin(), inner_result.end());
-            }
-        } else {
-            result.push_back(cpt);
-        }
-    }
-    return result;
-}
-
-// Function to sort subsequences based on canonical class
-std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts) {
-    std::vector<uint32_t> subsequence;
-    std::vector<uint32_t> result;
-    auto compareByCanonicalClass = [&](const uint32_t& a, const uint32_t& b) {
-        auto cc_a_it = unicode_canonical_class.find(a);
-        if (cc_a_it != unicode_canonical_class.end()) {
-            auto cc_b_it = unicode_canonical_class.find(b);
-            if (cc_b_it != unicode_canonical_class.end()) {
-                return cc_a_it->second < cc_b_it->second;
-            }
-
-        }
-        return false;
-    };
-
-    for (const auto& cpt : cpts) {
-        auto it = unicode_canonical_class.find(cpt);
-        if (it != unicode_canonical_class.end()) {
-            if (it->second > 0) {
-                subsequence.push_back(cpt);
-            } else {
-                if (!subsequence.empty()) {
-                    sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
-                    for (const auto& codepoint : subsequence) {
-                        result.push_back(codepoint);
-                    }
-                    subsequence.clear();
-                }
-               
-                result.push_back(cpt);
-            }
-        }
-    }
-
-    if (!subsequence.empty()) {
-        sort(subsequence.begin(), subsequence.end(), compareByCanonicalClass);
-        for (const auto& codepoint : subsequence) {
-            result.push_back(codepoint);
-        }
-    }
-
-    return result;
-}
-
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
     std::vector<uint32_t> result;
     result.reserve(cpts.size());
@@ -666,14 +602,6 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
     return result;
 }
 
-
-std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts) {
-    const auto &decomposed_cpts = decompose_cpts(cpts);
-    const auto &sorted_sequence = sort_by_canonical_class(decomposed_cpts);
-    //TODO: Do canonical composition
-    return sorted_sequence;
-}
-
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     std::vector<uint32_t> result;
     size_t offset = 0;
diff --git a/unicode.h b/unicode.h
index 3f4938d4d..d6a14d470 100644
--- a/unicode.h
+++ b/unicode.h
@@ -17,9 +17,6 @@ std::string unicode_cpt_to_utf8(uint32_t cp);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
-std::vector<uint32_t> unicode_cpts_normalize_nfc(const std::vector<uint32_t> & cpts);
-std::vector<uint32_t> decompose_cpts(const std::vector<uint32_t> & cpts);
-std::vector<uint32_t> sort_by_canonical_class(const std::vector<uint32_t> & cpts);
 
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);