From 728e1b4da0cbed99b817016115ec1a30f7281d61 Mon Sep 17 00:00:00 2001
From: Joan Martinez <joan.fontanals.martinez@jina.ai>
Date: Fri, 7 Jun 2024 09:55:21 +0200
Subject: [PATCH] fix: lowercase text one Unicode code point at a time

---
 llama.cpp   |  3 +--
 unicode.cpp | 14 ++++++++++++++
 unicode.h   |  2 ++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d060da871..f95ecd39a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13039,8 +13039,7 @@ struct llm_tokenizer_bpe {
                     case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                         //TODO: Apply lowercase + whitespace pretokenization
                         {
-                            std::string lowercase_text = text;
-                            std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
+                            std::string lowercase_text = lowercase(text);
                             std::regex regexPattern("\\w+|[^\\w\\s]+");
                             std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
                             std::sregex_token_iterator end;
diff --git a/unicode.cpp b/unicode.cpp
index 056a4c741..695eb6f3e 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -794,3 +794,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
 
     return unicode_byte_encoding_process(bpe_words);
 }
+
+
+
+// Returns a copy of UTF-8 `text` with every code point mapped through unicode_tolower.
+std::string lowercase(const std::string & text) {
+    std::string result;
+    result.reserve(text.size());  // capacity hint only; lowered forms may differ in byte length
+
+    for (const char32_t cpt : unicode_cpts_from_utf8(text)) {
+        result += unicode_cpt_to_utf8(unicode_tolower(cpt));  // re-encode the lowered code point
+    }
+
+    return result;
+}
diff --git a/unicode.h b/unicode.h
index 7513be4ad..9b6317c60 100644
--- a/unicode.h
+++ b/unicode.h
@@ -61,3 +61,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
 char32_t unicode_tolower(char32_t cp);
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
+
+std::string lowercase(const std::string & text); // lowercases text one code point at a time via unicode_tolower