unicode : fix? unicode_wstring_to_utf8
This commit is contained in:
parent
36d983262e
commit
06d3e693db
3 changed files with 19 additions and 21 deletions
28
unicode.cpp
28
unicode.cpp
|
@ -197,18 +197,14 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline std::wstring unicode_wstring_from_utf8(const std::string & s)
|
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
|
||||||
{
|
|
||||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
||||||
return conv.from_bytes(s);
|
return conv.from_bytes(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline std::string unicode_wstring_to_utf8(const std::wstring & ws)
|
static inline std::string unicode_wstring_to_utf8(const std::wstring & ws) {
|
||||||
{
|
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
||||||
// code to convert from utf32/utf16 to utf8
|
return conv.to_bytes(ws);
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
|
|
||||||
std::string utf8 = converter.to_bytes(ws);
|
|
||||||
return utf8;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
|
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
|
||||||
|
@ -233,7 +229,7 @@ static std::vector<size_t> unicode_gpt2_regex_preprocess(const std::wstring & wt
|
||||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
|
|
||||||
for(auto offset : offsets) {
|
for (auto offset : offsets) {
|
||||||
const std::string text = unicode_wstring_to_utf8(std::wstring(wtext, start, offset));
|
const std::string text = unicode_wstring_to_utf8(std::wstring(wtext, start, offset));
|
||||||
|
|
||||||
std::string token = "";
|
std::string token = "";
|
||||||
|
@ -248,13 +244,15 @@ static std::vector<size_t> unicode_gpt2_regex_preprocess(const std::wstring & wt
|
||||||
text_utf.reserve(text.size());
|
text_utf.reserve(text.size());
|
||||||
|
|
||||||
const auto cpts = unicode_cpts_from_utf8(text);
|
const auto cpts = unicode_cpts_from_utf8(text);
|
||||||
for (size_t i = 0; i < cpts.size(); ++i)
|
for (size_t i = 0; i < cpts.size(); ++i) {
|
||||||
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
|
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < (int)text_utf.size(); i++) {
|
for (int i = 0; i < (int)text_utf.size(); i++) {
|
||||||
const std::string & utf_char = text_utf[i];
|
const std::string & utf_char = text_utf[i];
|
||||||
bool split_condition = false;
|
bool split_condition = false;
|
||||||
int bytes_remain = text_utf.size() - i;
|
int bytes_remain = text_utf.size() - i;
|
||||||
|
|
||||||
// forward backward lookups
|
// forward backward lookups
|
||||||
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
||||||
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
||||||
|
@ -357,6 +355,7 @@ static std::vector<size_t> unicode_gpt2_regex_preprocess(const std::wstring & wt
|
||||||
token += utf_char;
|
token += utf_char;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
start += offset;
|
start += offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -403,7 +402,7 @@ static bool unicode_regex_with_custom_preprocessor_exists(const std::string & re
|
||||||
static std::vector<size_t> unicode_regex_custom_preprocess(const std::string & regex, const std::wstring & wtext, const std::vector<size_t> & offsets) {
|
static std::vector<size_t> unicode_regex_custom_preprocess(const std::string & regex, const std::wstring & wtext, const std::vector<size_t> & offsets) {
|
||||||
std::vector<size_t> bpe_offsets;
|
std::vector<size_t> bpe_offsets;
|
||||||
|
|
||||||
if(regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
if (regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
||||||
bpe_offsets = unicode_gpt2_regex_preprocess(wtext, offsets);
|
bpe_offsets = unicode_gpt2_regex_preprocess(wtext, offsets);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -497,10 +496,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
|
|
||||||
std::vector<size_t> bpe_offsets = {wtext.size()};
|
std::vector<size_t> bpe_offsets = {wtext.size()};
|
||||||
|
|
||||||
for(auto & regex_expr : regex_exprs) {
|
for (auto & regex_expr : regex_exprs) {
|
||||||
|
|
||||||
if (unicode_regex_equivalent_wregex_exists(regex_expr)) {
|
if (unicode_regex_equivalent_wregex_exists(regex_expr)) {
|
||||||
const std::wstring& wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr);
|
const std::wstring & wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr);
|
||||||
bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr);
|
bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr);
|
||||||
} else if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) {
|
} else if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) {
|
||||||
bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets);
|
bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets);
|
||||||
|
@ -512,7 +510,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||||
std::vector<std::string> bpe_words;
|
std::vector<std::string> bpe_words;
|
||||||
bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size
|
bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
for(size_t & offset : bpe_offsets) {
|
for (size_t & offset : bpe_offsets) {
|
||||||
bpe_words.emplace_back(unicode_wstring_to_utf8(std::wstring(wtext, start, offset)));
|
bpe_words.emplace_back(unicode_wstring_to_utf8(std::wstring(wtext, start, offset)));
|
||||||
start += offset;
|
start += offset;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue