unicode : cleanup

This commit is contained in:
Georgi Gerganov 2024-04-29 11:20:42 +03:00
parent c68d2596ea
commit af05268cdd
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@ -56,23 +56,22 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
offset += 4; offset += 4;
return result; return result;
} }
throw std::invalid_argument("invalid string"); throw std::invalid_argument("failed to convert utf8 to codepoint");
} }
// Encode one Unicode codepoint as UTF-16 code units.
//
//   cp <= 0xffff           -> one unit holding cp unchanged
//   0x10000 .. 0x10ffff    -> two units: high surrogate, then low surrogate
//   anything larger        -> throws std::invalid_argument("invalid cpt")
static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
    const bool fits_one_unit  = cp <= 0xffff;
    const bool needs_two_units = !fits_one_unit && cp <= 0x10ffff;

    if (!fits_one_unit && !needs_two_units) {
        throw std::invalid_argument("invalid cpt");
    }

    std::vector<uint16_t> units;

    if (fits_one_unit) {
        units.push_back(static_cast<uint16_t>(cp));
        return units;
    }

    // Supplementary plane: split the 20-bit value (cp - 0x10000) into two
    // 10-bit halves and tag them with the surrogate prefixes.
    const uint32_t reduced = cp - 0x10000;
    units.push_back(static_cast<uint16_t>(0xd800 | (reduced >> 10)));
    units.push_back(static_cast<uint16_t>(0xdc00 | (reduced & 0x03ff)));
    return units;
}
//static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) { //static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
// std::vector<uint16_t> result; // std::vector<uint16_t> result;
@ -83,28 +82,28 @@ static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
// return result; // return result;
//} //}
// Decode one Unicode codepoint from a UTF-16 sequence, starting at `offset`.
//
// Parameters:
//   utf16  - the UTF-16 code units to read from
//   offset - index of the first code unit to decode; must be < utf16.size().
//            On success it is advanced by the number of units consumed
//            (1 for a single unit, 2 for a surrogate pair).
//
// Returns the decoded codepoint.
//
// Throws std::invalid_argument("invalid character") when a high surrogate is
// not followed by a low surrogate (either the input ends, or the next unit is
// outside 0xdc00..0xdfff). `offset` is left unchanged in that case.
//
// A unit that is not a high surrogate is returned as-is, so an unpaired low
// surrogate is passed through rather than rejected.
static uint32_t cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
    assert(offset < utf16.size());

    // Read relative to `offset`, not from the start of the buffer, so that
    // repeated calls walk through the whole sequence correctly.
    const uint32_t lead = utf16[offset];

    // High surrogates are exactly the units whose top 6 bits are 110110.
    if ((lead & 0xfc00) != 0xd800) {
        offset += 1;
        return lead;
    }

    if (offset + 1 >= utf16.size()) {
        throw std::invalid_argument("invalid character");
    }

    const uint32_t trail = utf16[offset + 1];

    // Low surrogates are exactly the units whose top 6 bits are 110111;
    // masking with 0xfc00 (not 0xdc00) keeps 0xfc00..0xffff from slipping through.
    if ((trail & 0xfc00) != 0xdc00) {
        throw std::invalid_argument("invalid character");
    }

    offset += 2;
    return 0x10000 + (((lead & 0x03ff) << 10) | (trail & 0x03ff));
}
//static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) { //static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
// std::vector<uint32_t> result; // std::vector<uint32_t> result;
// size_t offset = 0; // size_t offset = 0;
// while (offset < utf16.size()) { // while (offset < utf16.size()) {
// result.push_back(cpt_from_utf16(utf16, offset)); // result.push_back(unicode_cpt_from_utf16(utf16, offset));
// } // }
// return result; // return result;
//} //}