Fix bpe_gpt2_preprocess
This commit is contained in:
parent
f026f8120f
commit
98d5d20044
1 changed file with 17 additions and 12 deletions
13
llama.cpp
13
llama.cpp
|
@@ -7789,7 +7789,7 @@ private:
|
||||||
work_queue.push(bigram);
|
work_queue.push(bigram);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
||||||
std::vector<std::string> bpe_words;
|
std::vector<std::string> bpe_words;
|
||||||
std::vector<std::string> bpe_encoded_words;
|
std::vector<std::string> bpe_encoded_words;
|
||||||
|
|
||||||
|
@@ -7819,7 +7819,7 @@ private:
|
||||||
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
||||||
|
|
||||||
// handling contractions
|
// handling contractions
|
||||||
if (!split_condition && bytes_remain >= 2) {
|
if (!split_condition && bytes_remain >= 2 && token != " ") {
|
||||||
// 's|'t|'m|'d
|
// 's|'t|'m|'d
|
||||||
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
|
@@ -7835,7 +7835,7 @@ private:
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!split_condition && bytes_remain >= 3) {
|
if (!split_condition && bytes_remain >= 3 && token != " ") {
|
||||||
// 're|'ve|'ll
|
// 're|'ve|'ll
|
||||||
if (utf_char == "\'" && (
|
if (utf_char == "\'" && (
|
||||||
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
||||||
|
@@ -7891,7 +7891,7 @@ private:
|
||||||
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -7913,9 +7913,14 @@ private:
|
||||||
collecting_whitespace_lookahead = false;
|
collecting_whitespace_lookahead = false;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
if (codepoint_type(token) == CODEPOINT_TYPE_PUNCTUATION && codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER) {
|
||||||
|
bpe_words.emplace_back(token);
|
||||||
|
token = utf_char;
|
||||||
|
} else {
|
||||||
token += utf_char;
|
token += utf_char;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (std::string & word : bpe_words) {
|
for (std::string & word : bpe_words) {
|
||||||
std::string encoded_token = "";
|
std::string encoded_token = "";
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue