fix: lowercase text one Unicode code point at a time

This commit is contained in:
Joan Martinez 2024-06-07 09:55:21 +02:00
parent 605a6199e9
commit 728e1b4da0
3 changed files with 17 additions and 2 deletions

View file

@ -13039,8 +13039,7 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH: case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
//TODO: Apply lowercase + whitespace pretokenization //TODO: Apply lowercase + whitespace pretokenization
{ {
std::string lowercase_text = text; std::string lowercase_text = lowercase(text);
std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
std::regex regexPattern("\\w+|[^\\w\\s]+"); std::regex regexPattern("\\w+|[^\\w\\s]+");
std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern); std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
std::sregex_token_iterator end; std::sregex_token_iterator end;

View file

@ -794,3 +794,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
return unicode_byte_encoding_process(bpe_words); return unicode_byte_encoding_process(bpe_words);
} }
// Returns a lowercased copy of the UTF-8 string `text`.
//
// The input is decoded into code points with unicode_cpts_from_utf8(), each
// code point is mapped through unicode_tolower(), and the mapped code point is
// re-encoded with unicode_cpt_to_utf8() and appended to the result. The
// conversion therefore operates per code point rather than per byte.
//
// An empty input yields an empty string.
std::string lowercase(const std::string & text) {
    std::string result;
    // Size hint only: the output is built from one re-encoded code point per
    // input code point, so the input length is a reasonable starting capacity.
    result.reserve(text.size());

    const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
    for (const char32_t cpt : cpts) {
        // Lowercase and encode each code point exactly once.
        result += unicode_cpt_to_utf8(unicode_tolower(cpt));
    }
    return result;
}

View file

@ -61,3 +61,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
char32_t unicode_tolower(char32_t cp); char32_t unicode_tolower(char32_t cp);
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs); std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
// Returns a lowercased copy of the UTF-8 string `text`; each decoded code point
// is passed through unicode_tolower() and re-encoded as UTF-8.
std::string lowercase(const std::string & text);