unicode : fix data race for unidentified codepoints

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-02-13 13:08:12 +02:00
parent 21851c11d1
commit 6c533edb94
No known key found for this signature in database
GPG key ID: BF970631944C16B7

View file

@ -344,8 +344,9 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) { static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
std::vector<uint32_t> result; std::vector<uint32_t> result;
size_t offset = 0; size_t offset = 0;
while (offset < utf16.size()) while (offset < utf16.size()) {
result.push_back(codepoint_from_utf16(utf16, offset)); result.push_back(codepoint_from_utf16(utf16, offset));
}
return result; return result;
} }
@ -393,12 +394,13 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
static int codepoint_type(uint32_t cp) { static int codepoint_type(uint32_t cp) {
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map(); static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
return codepoint_types[cp]; return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types[cp];
} }
static int codepoint_type(const std::string & utf8) { static int codepoint_type(const std::string & utf8) {
if (utf8.length() == 0) if (utf8.length() == 0) {
return CODEPOINT_TYPE_UNIDENTIFIED; return CODEPOINT_TYPE_UNIDENTIFIED;
}
size_t offset = 0; size_t offset = 0;
return codepoint_type(codepoint_from_utf8(utf8, offset)); return codepoint_type(codepoint_from_utf8(utf8, offset));
} }