minor : code style normalization

This commit is contained in:
Georgi Gerganov 2024-02-11 12:59:59 +02:00
parent 6972e7e90e
commit 8fbefed148
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@ -8262,18 +8262,19 @@ struct llm_tokenizer_wpm {
return words; return words;
} }
std::string normalize(const std::string &text) { std::string normalize(const std::string & text) {
// TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98 // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
std::string text2 = strip_accents(text); std::string text2 = strip_accents(text);
for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) { for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
char c = text2[i]; char c = text2[i];
if (c >= 'A' && c <= 'Z') if (c >= 'A' && c <= 'Z') {
text2[i] = c - 'A' + 'a'; text2[i] = c - 'A' + 'a';
} }
}
return text2; return text2;
} }
bool is_chinese_char(const std::string& str) { bool is_chinese_char(const std::string & str) {
int len = str.length(); int len = str.length();
unsigned int codepoint = 0; unsigned int codepoint = 0;
int num_bytes = 0; int num_bytes = 0;
@ -8312,14 +8313,14 @@ struct llm_tokenizer_wpm {
(codepoint >= 0x2F800 && codepoint <= 0x2FA1F) || (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
(codepoint >= 0x3000 && codepoint <= 0x303F) || (codepoint >= 0x3000 && codepoint <= 0x303F) ||
(codepoint >= 0xFF00 && codepoint <= 0xFFEF)) { (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
return true; return true; // NOLINT
} }
return false; return false;
} }
std::string strip_accents(const std::string &inputString) { std::string strip_accents(const std::string & input_string) {
std::string resultString; std::string resultString;
std::map<std::string, char> accentMap = { std::map<std::string, char> accent_map = {
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'}, {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'}, {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'}, {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
@ -8331,11 +8332,11 @@ struct llm_tokenizer_wpm {
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'}, {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
}; };
for (size_t i = 0; i < inputString.length();) { for (size_t i = 0; i < input_string.length();) {
int len = utf8_len(inputString[i]); int len = utf8_len(input_string[i]);
std::string curChar = inputString.substr(i, len); std::string curChar = input_string.substr(i, len);
auto iter = accentMap.find(curChar); auto iter = accent_map.find(curChar);
if (iter != accentMap.end()) { if (iter != accent_map.end()) {
resultString += iter->second; resultString += iter->second;
} else { } else {
resultString += curChar; resultString += curChar;