fix: lowercase text Unicode code point by code point
This commit is contained in:
parent
605a6199e9
commit
728e1b4da0
3 changed files with 17 additions and 2 deletions
|
@ -13039,8 +13039,7 @@ struct llm_tokenizer_bpe {
|
||||||
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
||||||
//TODO: Apply lowercase + whitespace pretokenization
|
//TODO: Apply lowercase + whitespace pretokenization
|
||||||
{
|
{
|
||||||
std::string lowercase_text = text;
|
std::string lowercase_text = lowercase(text);
|
||||||
std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
|
|
||||||
std::regex regexPattern("\\w+|[^\\w\\s]+");
|
std::regex regexPattern("\\w+|[^\\w\\s]+");
|
||||||
std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
|
std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
|
||||||
std::sregex_token_iterator end;
|
std::sregex_token_iterator end;
|
||||||
|
|
14
unicode.cpp
14
unicode.cpp
|
@ -794,3 +794,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
|
|
||||||
return unicode_byte_encoding_process(bpe_words);
|
return unicode_byte_encoding_process(bpe_words);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
std::string lowercase(const std::string & text) {
|
||||||
|
std::string lowercase("");
|
||||||
|
const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
|
||||||
|
|
||||||
|
for (const char32_t cpt : cpts) {
|
||||||
|
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
||||||
|
lowercase += unicode_cpt_to_utf8(unicode_tolower(cpt)); // append char to word
|
||||||
|
}
|
||||||
|
|
||||||
|
return lowercase;
|
||||||
|
}
|
||||||
|
|
|
@ -61,3 +61,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||||
char32_t unicode_tolower(char32_t cp);
|
char32_t unicode_tolower(char32_t cp);
|
||||||
|
|
||||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||||
|
|
||||||
|
std::string lowercase(const std::string & text);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue