unicode : fix? unicode_wstring_to_utf8

This commit is contained in:
Georgi Gerganov 2024-04-26 12:55:11 +03:00
parent 36d983262e
commit 06d3e693db
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 19 additions and 21 deletions

View file

@ -197,18 +197,14 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
return map; return map;
} }
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
{
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv; std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
return conv.from_bytes(s); return conv.from_bytes(s);
} }
static inline std::string unicode_wstring_to_utf8(const std::wstring & ws) static inline std::string unicode_wstring_to_utf8(const std::wstring & ws) {
{ std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
// code to convert from utf32/utf16 to utf8 return conv.to_bytes(ws);
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
std::string utf8 = converter.to_bytes(ws);
return utf8;
} }
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) { static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
@ -233,7 +229,7 @@ static std::vector<size_t> unicode_gpt2_regex_preprocess(const std::wstring & wt
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0; size_t start = 0;
for(auto offset : offsets) { for (auto offset : offsets) {
const std::string text = unicode_wstring_to_utf8(std::wstring(wtext, start, offset)); const std::string text = unicode_wstring_to_utf8(std::wstring(wtext, start, offset));
std::string token = ""; std::string token = "";
@ -248,13 +244,15 @@ static std::vector<size_t> unicode_gpt2_regex_preprocess(const std::wstring & wt
text_utf.reserve(text.size()); text_utf.reserve(text.size());
const auto cpts = unicode_cpts_from_utf8(text); const auto cpts = unicode_cpts_from_utf8(text);
for (size_t i = 0; i < cpts.size(); ++i) for (size_t i = 0; i < cpts.size(); ++i) {
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i])); text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
}
for (int i = 0; i < (int)text_utf.size(); i++) { for (int i = 0; i < (int)text_utf.size(); i++) {
const std::string & utf_char = text_utf[i]; const std::string & utf_char = text_utf[i];
bool split_condition = false; bool split_condition = false;
int bytes_remain = text_utf.size() - i; int bytes_remain = text_utf.size() - i;
// forward backward lookups // forward backward lookups
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
@ -357,6 +355,7 @@ static std::vector<size_t> unicode_gpt2_regex_preprocess(const std::wstring & wt
token += utf_char; token += utf_char;
} }
} }
start += offset; start += offset;
} }
@ -403,7 +402,7 @@ static bool unicode_regex_with_custom_preprocessor_exists(const std::string & re
static std::vector<size_t> unicode_regex_custom_preprocess(const std::string & regex, const std::wstring & wtext, const std::vector<size_t> & offsets) { static std::vector<size_t> unicode_regex_custom_preprocess(const std::string & regex, const std::wstring & wtext, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets; std::vector<size_t> bpe_offsets;
if(regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { if (regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
bpe_offsets = unicode_gpt2_regex_preprocess(wtext, offsets); bpe_offsets = unicode_gpt2_regex_preprocess(wtext, offsets);
} }
@ -497,10 +496,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
std::vector<size_t> bpe_offsets = {wtext.size()}; std::vector<size_t> bpe_offsets = {wtext.size()};
for(auto & regex_expr : regex_exprs) { for (auto & regex_expr : regex_exprs) {
if (unicode_regex_equivalent_wregex_exists(regex_expr)) { if (unicode_regex_equivalent_wregex_exists(regex_expr)) {
const std::wstring& wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr); const std::wstring & wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr);
bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr); bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr);
} else if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) { } else if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) {
bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets); bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets);
@ -512,7 +510,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
std::vector<std::string> bpe_words; std::vector<std::string> bpe_words;
bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size
size_t start = 0; size_t start = 0;
for(size_t & offset : bpe_offsets) { for (size_t & offset : bpe_offsets) {
bpe_words.emplace_back(unicode_wstring_to_utf8(std::wstring(wtext, start, offset))); bpe_words.emplace_back(unicode_wstring_to_utf8(std::wstring(wtext, start, offset)));
start += offset; start += offset;
} }