fix: lowercase text one Unicode code point at a time
This commit is contained in:
parent
605a6199e9
commit
728e1b4da0
3 changed files with 17 additions and 2 deletions
|
@ -13039,8 +13039,7 @@ struct llm_tokenizer_bpe {
|
|||
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
||||
//TODO: Apply lowercase + whitespace pretokenization
|
||||
{
|
||||
std::string lowercase_text = text;
|
||||
std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
|
||||
std::string lowercase_text = lowercase(text);
|
||||
std::regex regexPattern("\\w+|[^\\w\\s]+");
|
||||
std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
|
||||
std::sregex_token_iterator end;
|
||||
|
|
14
unicode.cpp
14
unicode.cpp
|
@ -794,3 +794,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|||
|
||||
return unicode_byte_encoding_process(bpe_words);
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::string lowercase(const std::string & text) {
|
||||
std::string lowercase("");
|
||||
const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
for (const char32_t cpt : cpts) {
|
||||
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
||||
lowercase += unicode_cpt_to_utf8(unicode_tolower(cpt)); // append char to word
|
||||
}
|
||||
|
||||
return lowercase;
|
||||
}
|
||||
|
|
|
@ -61,3 +61,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
|||
char32_t unicode_tolower(char32_t cp);
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||
|
||||
// Returns a lowercased copy of `text`: the string is decoded into code points,
// each one is mapped through unicode_tolower(), and the results are re-encoded
// and concatenated in order.
std::string lowercase(const std::string & text);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue