unicode : fix data race for unidentified codepoints

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-02-13 13:08:12 +02:00
parent 21851c11d1
commit 6c533edb94
No known key found for this signature in database
GPG key ID: BF970631944C16B7

View file

@ -344,8 +344,9 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) { static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
std::vector<uint32_t> result; std::vector<uint32_t> result;
size_t offset = 0; size_t offset = 0;
while (offset < utf16.size()) while (offset < utf16.size()) {
result.push_back(codepoint_from_utf16(utf16, offset)); result.push_back(codepoint_from_utf16(utf16, offset));
}
return result; return result;
} }
@ -364,19 +365,19 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
for(auto i = p.first; i <= p.second; ++ i) for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_DIGIT; codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
} }
for(auto p : letter_ranges) { for (auto p : letter_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_LETTER; codepoint_types[i] = CODEPOINT_TYPE_LETTER;
} }
for(auto p : whitespace_ranges) { for (auto p : whitespace_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE; codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
} }
for(auto p : accent_mark_ranges) { for (auto p : accent_mark_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK; codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
} }
for(auto p : punctuation_ranges) { for (auto p : punctuation_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION; codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
} }
@ -384,7 +385,7 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
for (auto i = p.first; i <= p.second; ++i) for (auto i = p.first; i <= p.second; ++i)
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL; codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
} }
for(auto p : control_ranges) { for (auto p : control_ranges) {
for(auto i = p.first; i <= p.second; ++ i) for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_CONTROL; codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
} }
@ -393,12 +394,13 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
static int codepoint_type(uint32_t cp) { static int codepoint_type(uint32_t cp) {
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map(); static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
return codepoint_types[cp]; return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types[cp];
} }
static int codepoint_type(const std::string & utf8) { static int codepoint_type(const std::string & utf8) {
if (utf8.length() == 0) if (utf8.length() == 0) {
return CODEPOINT_TYPE_UNIDENTIFIED; return CODEPOINT_TYPE_UNIDENTIFIED;
}
size_t offset = 0; size_t offset = 0;
return codepoint_type(codepoint_from_utf8(utf8, offset)); return codepoint_type(codepoint_from_utf8(utf8, offset));
} }