unicode : switch to multimap based nfd_map (#5799)
* switch to multimap-based nfd_map due to compile-time issues * simplify multimap keys * don't construct a new locale every time
This commit is contained in:
parent
5cb02b4a01
commit
9600d59e01
2 changed files with 312 additions and 265 deletions
11
llama.cpp
11
llama.cpp
|
@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
|
|||
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
|
||||
std::vector<uint32_t> nfd_codepoints;
|
||||
for (uint32_t code : codepoints) {
|
||||
auto it = nfd_map.find(code);
|
||||
if (it != nfd_map.end()) {
|
||||
for (uint32_t c : it->second) {
|
||||
nfd_codepoints.push_back(c);
|
||||
auto it = nfd_map.equal_range(code);
|
||||
if (it.first != it.second) {
|
||||
for (auto jt = it.first; jt != it.second; jt++) {
|
||||
nfd_codepoints.push_back(jt->second);
|
||||
}
|
||||
} else {
|
||||
nfd_codepoints.push_back(code);
|
||||
|
@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
|
|||
}
|
||||
|
||||
uint32_t to_lower(uint32_t code) {
|
||||
static const std::locale locale("en_US.UTF-8");
|
||||
#if defined(_WIN32)
|
||||
if (code > 0xFFFF) {
|
||||
return code;
|
||||
}
|
||||
#endif
|
||||
return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
|
||||
return std::tolower(wchar_t(code), locale);
|
||||
}
|
||||
|
||||
bool is_ascii_punct(uint32_t code) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue