wpm : portable unicode tolower (#6305)
Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
This commit is contained in:
parent
557410b8f0
commit
32c8486e1f
9 changed files with 1699 additions and 1425 deletions
22
llama.cpp
22
llama.cpp
|
@@ -61,6 +61,7 @@
|
|||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
+#include <cctype>
|
||||
#include <cfloat>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
|
@@ -71,7 +72,6 @@
|
|||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
-#include <cwctype>
|
||||
#include <forward_list>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
|
@@ -11010,7 +11010,7 @@ struct llm_tokenizer_wpm {
|
|||
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
||||
continue;
|
||||
}
|
||||
-code = to_lower(code);
|
||||
+code = unicode_tolower(code);
|
||||
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
||||
code = ' ';
|
||||
}
|
||||
|
@@ -11030,7 +11030,7 @@ struct llm_tokenizer_wpm {
|
|||
std::vector<std::string> words;
|
||||
while (r < new_str.size()) {
|
||||
// if is whitespace
|
||||
-if (isspace(new_str[r])) {
|
||||
+if (isspace(new_str[r], std::locale::classic())) {
|
||||
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
||||
l = r + 1;
|
||||
r = l;
|
||||
|
@@ -11044,18 +11044,12 @@ struct llm_tokenizer_wpm {
|
|||
return words;
|
||||
}
|
||||
|
||||
-uint32_t to_lower(uint32_t code) {
|
||||
-static const std::locale locale("en_US.UTF-8");
|
||||
-#if defined(_WIN32)
|
||||
-if (code > 0xFFFF) {
|
||||
-return code;
|
||||
-}
|
||||
-#endif
|
||||
-return std::tolower(wchar_t(code), locale);
|
||||
-}
|
||||
|
||||
bool is_ascii_punct(uint32_t code) {
|
||||
-return code < 256 && ispunct(code);
|
||||
+if (code > 0xFF) {
|
||||
+return false;
|
||||
+}
|
||||
+auto c = char(static_cast<unsigned char>(code));
|
||||
+return ispunct(c, std::locale::classic());
|
||||
}
|
||||
|
||||
bool is_chinese_char(uint32_t cpt) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue