Added needed functionality, testing remains
This commit is contained in:
parent
7e308ed212
commit
feeaf4f39c
5 changed files with 175 additions and 173 deletions
53
llama.cpp
53
llama.cpp
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -2,6 +2,8 @@
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
@ -14,4 +16,5 @@ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
|
||||||
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
|
||||||
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
|
||||||
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
extern const std::map<char32_t, char32_t> unicode_map_lowercase;
|
||||||
extern const std::map<std::string, std::wstring> unicode_regex_to_wregex;
|
extern const std::map<std::string, std::wstring> unicode_regex_equivalent_wregex;
|
||||||
|
extern const std::set<std::string> unicode_regex_with_custom_preprocessor;
|
71
unicode.cpp
71
unicode.cpp
|
@ -228,8 +228,13 @@ static std::vector<std::string> unicode_byte_encoding_process(const std::vector<
|
||||||
return bpe_encoded_words;
|
return bpe_encoded_words;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::vector<std::string> unicode_custom_preprocess(const std::string & text) {
|
static std::vector<size_t> unicode_gpt2_regex_preprocess(const std::wstring & wtext, const std::vector<size_t> & offsets) {
|
||||||
std::vector<std::string> bpe_words;
|
std::vector<size_t> bpe_offsets; // stroe the offset of each word
|
||||||
|
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||||
|
size_t start = 0;
|
||||||
|
|
||||||
|
for(auto offset : offsets) {
|
||||||
|
const std::string text = unicode_wstring_to_utf8(std::wstring(wtext, start, offset));
|
||||||
|
|
||||||
std::string token = "";
|
std::string token = "";
|
||||||
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
||||||
|
@ -241,7 +246,6 @@ static std::vector<std::string> unicode_custom_preprocess(const std::string & te
|
||||||
|
|
||||||
std::vector<std::string> text_utf;
|
std::vector<std::string> text_utf;
|
||||||
text_utf.reserve(text.size());
|
text_utf.reserve(text.size());
|
||||||
bpe_words.reserve(text.size());
|
|
||||||
|
|
||||||
const auto cpts = unicode_cpts_from_utf8(text);
|
const auto cpts = unicode_cpts_from_utf8(text);
|
||||||
for (size_t i = 0; i < cpts.size(); ++i)
|
for (size_t i = 0; i < cpts.size(); ++i)
|
||||||
|
@ -263,10 +267,10 @@ static std::vector<std::string> unicode_custom_preprocess(const std::string & te
|
||||||
}
|
}
|
||||||
if (split_condition) {
|
if (split_condition) {
|
||||||
if (token.size()) {
|
if (token.size()) {
|
||||||
bpe_words.emplace_back(token); // push previous content as token
|
bpe_offsets.emplace_back(unicode_wstring_from_utf8(token).size());
|
||||||
}
|
}
|
||||||
token = utf_char + utf_char_next;
|
token = utf_char + utf_char_next;
|
||||||
bpe_words.emplace_back(token);
|
bpe_offsets.emplace_back(unicode_wstring_from_utf8(token).size());
|
||||||
token = "";
|
token = "";
|
||||||
i++;
|
i++;
|
||||||
continue;
|
continue;
|
||||||
|
@ -284,10 +288,10 @@ static std::vector<std::string> unicode_custom_preprocess(const std::string & te
|
||||||
if (split_condition) {
|
if (split_condition) {
|
||||||
// current token + next token can be defined
|
// current token + next token can be defined
|
||||||
if (token.size()) {
|
if (token.size()) {
|
||||||
bpe_words.emplace_back(token); // push previous content as token
|
bpe_offsets.emplace_back(unicode_wstring_from_utf8(token).size());
|
||||||
}
|
}
|
||||||
token = utf_char + utf_char_next + utf_char_next_next;
|
token = utf_char + utf_char_next + utf_char_next_next;
|
||||||
bpe_words.emplace_back(token); // the contraction
|
bpe_offsets.emplace_back(unicode_wstring_from_utf8(token).size());
|
||||||
token = "";
|
token = "";
|
||||||
i += 2;
|
i += 2;
|
||||||
continue;
|
continue;
|
||||||
|
@ -340,7 +344,7 @@ static std::vector<std::string> unicode_custom_preprocess(const std::string & te
|
||||||
|
|
||||||
if (split_condition) {
|
if (split_condition) {
|
||||||
if (token.size()) {
|
if (token.size()) {
|
||||||
bpe_words.emplace_back(token);
|
bpe_offsets.emplace_back(unicode_wstring_from_utf8(token).size());
|
||||||
}
|
}
|
||||||
token = utf_char;
|
token = utf_char;
|
||||||
collecting = false;
|
collecting = false;
|
||||||
|
@ -353,8 +357,9 @@ static std::vector<std::string> unicode_custom_preprocess(const std::string & te
|
||||||
token += utf_char;
|
token += utf_char;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return bpe_words;
|
return bpe_offsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::vector<size_t> unicode_regex_preprocess(const std::wstring & text, const std::vector<size_t> & offsets, const std::wstring & regex_expr) {
|
static std::vector<size_t> unicode_regex_preprocess(const std::wstring & text, const std::vector<size_t> & offsets, const std::wstring & regex_expr) {
|
||||||
|
@ -386,16 +391,22 @@ static std::vector<size_t> unicode_regex_preprocess(const std::wstring & text, c
|
||||||
return bpe_offsets;
|
return bpe_offsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool unicode_regex_matched(const std::wstring & text, const std::vector<std::wstring> & regex_exprs) {
|
static bool unicode_regex_equivalent_wregex_exists(const std::string & regex) {
|
||||||
|
return unicode_regex_equivalent_wregex.find(regex) != unicode_regex_equivalent_wregex.end();
|
||||||
for(auto & regex_expr: regex_exprs) {
|
|
||||||
std::wregex expr(regex_expr);
|
|
||||||
if(std::regex_match(text, expr)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
static bool unicode_regex_with_custom_preprocessor_exists(const std::string & regex) {
|
||||||
|
return unicode_regex_with_custom_preprocessor.find(regex) != unicode_regex_with_custom_preprocessor.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<size_t> unicode_regex_custom_preprocess(const std::string & regex, const std::wstring & wtext, const std::vector<size_t> & offsets) {
|
||||||
|
std::vector<size_t> bpe_offsets;
|
||||||
|
|
||||||
|
if(regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
||||||
|
bpe_offsets = unicode_gpt2_regex_preprocess(wtext, offsets);
|
||||||
|
}
|
||||||
|
|
||||||
|
return bpe_offsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -480,32 +491,28 @@ char32_t unicode_tolower(char32_t cp) {
|
||||||
return it == unicode_map_lowercase.end() ? cp : it->second;
|
return it == unicode_map_lowercase.end() ? cp : it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool unicode_wregex_exists(const std::string & regex) {
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
||||||
return unicode_regex_to_wregex.find(regex) != unicode_regex_to_wregex.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::wstring> & regex_exprs) {
|
|
||||||
std::wstring wtext = unicode_wstring_from_utf8(text);
|
std::wstring wtext = unicode_wstring_from_utf8(text);
|
||||||
|
|
||||||
std::vector<size_t> bpe_offsets = {wtext.size()};
|
std::vector<size_t> bpe_offsets = {wtext.size()};
|
||||||
|
|
||||||
for(auto & regex_expr : regex_exprs) {
|
for(auto & regex_expr : regex_exprs) {
|
||||||
bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, regex_expr);
|
|
||||||
|
if (unicode_regex_equivalent_wregex_exists(regex_expr)) {
|
||||||
|
const std::wstring& wregex_expr = unicode_regex_equivalent_wregex.at(regex_expr);
|
||||||
|
bpe_offsets = unicode_regex_preprocess(wtext, bpe_offsets, wregex_expr);
|
||||||
|
} else if (unicode_regex_with_custom_preprocessor_exists(regex_expr)) {
|
||||||
|
bpe_offsets = unicode_regex_custom_preprocess(regex_expr, wtext, bpe_offsets);
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("Unicode regex is not found");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> bpe_words;
|
std::vector<std::string> bpe_words;
|
||||||
bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size
|
bpe_words.reserve(bpe_offsets.size()); // Reserve memory for the approximate size
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
for(size_t & offset : bpe_offsets) {
|
for(size_t & offset : bpe_offsets) {
|
||||||
const auto temp_word = std::wstring(wtext, start, offset);
|
bpe_words.emplace_back(unicode_wstring_to_utf8(std::wstring(wtext, start, offset)));
|
||||||
|
|
||||||
if(unicode_regex_matched(temp_word, regex_exprs)) {
|
|
||||||
bpe_words.emplace_back(unicode_wstring_to_utf8(temp_word));
|
|
||||||
} else {
|
|
||||||
auto custom_bpe_words = unicode_custom_preprocess(unicode_wstring_to_utf8(temp_word));
|
|
||||||
bpe_words.insert(bpe_words.end(), custom_bpe_words.begin(), custom_bpe_words.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
start += offset;
|
start += offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,5 +28,4 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||||
|
|
||||||
char32_t unicode_tolower(char32_t cp);
|
char32_t unicode_tolower(char32_t cp);
|
||||||
|
|
||||||
bool unicode_wregex_exists(const std::string & regex);
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::wstring> & regex_exprs);
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue