From 728e1b4da0cbed99b817016115ec1a30f7281d61 Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Fri, 7 Jun 2024 09:55:21 +0200
Subject: [PATCH] fix: lowercase unicode pt by unicode pt

---
Reviewer note: this copy of the patch was edited by hand after export.
lowercase() no longer builds an unused duplicate of each converted code
point, its local accumulator no longer shadows the function name, and
comments were added. The blob hashes on the `index` lines below still
describe the original export. Context-line indentation was reconstructed
from a whitespace-collapsed copy; if a hunk fails to match, apply with
`git apply --ignore-whitespace`.

 llama.cpp   |  3 +--
 unicode.cpp | 14 ++++++++++++++
 unicode.h   |  3 +++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d060da871..f95ecd39a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13039,8 +13039,7 @@ struct llm_tokenizer_bpe {
                     case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                         //TODO: Apply lowercase + whitespace pretokenization
                         {
-                            std::string lowercase_text = text;
-                            std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
+                            std::string lowercase_text = lowercase(text);
                             std::regex regexPattern("\\w+|[^\\w\\s]+");
                             std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
                             std::sregex_token_iterator end;
diff --git a/unicode.cpp b/unicode.cpp
index 056a4c741..695eb6f3e 100644
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -794,3 +794,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
 
     return unicode_byte_encoding_process(bpe_words);
 }
+
+// Lowercase a UTF-8 string one code point at a time: decode the input with
+// unicode_cpts_from_utf8, map each code point through unicode_tolower, and
+// re-encode it with unicode_cpt_to_utf8. Returns the concatenated result.
+std::string lowercase(const std::string & text) {
+    std::string result;
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    for (const char32_t cpt : cpts) {
+        result += unicode_cpt_to_utf8(unicode_tolower(cpt)); // append the lowered code point
+    }
+
+    return result;
+}
diff --git a/unicode.h b/unicode.h
index 7513be4ad..9b6317c60 100644
--- a/unicode.h
+++ b/unicode.h
@@ -61,3 +61,6 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
 char32_t unicode_tolower(char32_t cp);
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
+
+// Returns a copy of the UTF-8 string `text` with unicode_tolower applied to each code point.
+std::string lowercase(const std::string & text);