Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess * Don't add bos token in test
This commit is contained in:
parent
c5b49360d0
commit
233fc1c69f
5 changed files with 17 additions and 20 deletions
|
@ -6342,7 +6342,6 @@ private:
|
|||
for (int i = 0; i < (int)text_utf.size(); i++) {
|
||||
const std::string & utf_char = text_utf[i];
|
||||
bool split_condition = false;
|
||||
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
|
||||
int bytes_remain = text_utf.size() - i;
|
||||
// forward backward lookups
|
||||
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
||||
|
@ -6368,9 +6367,9 @@ private:
|
|||
if (!split_condition && bytes_remain >= 3) {
|
||||
// 're|'ve|'ll
|
||||
if (utf_char == "\'" && (
|
||||
(utf_char_next == "r" || utf_char_next_next == "e") ||
|
||||
(utf_char_next == "v" || utf_char_next_next == "e") ||
|
||||
(utf_char_next == "l" || utf_char_next_next == "l"))
|
||||
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
||||
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
||||
(utf_char_next == "l" && utf_char_next_next == "l"))
|
||||
) {
|
||||
split_condition = true;
|
||||
}
|
||||
|
@ -6421,7 +6420,7 @@ private:
|
|||
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
||||
split_condition = true;
|
||||
}
|
||||
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
|
||||
else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
||||
split_condition = true;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue