Detokenizer fixes (#8039)
* Add llama_detokenize(): - Update header files location - UNKNOWN and CONTROL are 'special pieces' - Remove space after UNKNOWN and CONTROL - Refactor llama_token_to_piece() - Add flag: clean_up_tokenization_spaces - Symmetric params for llama_tokenize() and llama_detokenize() * Update and fix tokenizer tests: - Using llama_detokenize() - Unexpected vocab type as test fail instead of error - Useful when automating tests: - If you don't know in advance the vocab type - Differenciate other loading errors - Skip unicode surrogaes and undefined - Gracefully exit threads - Using exit() is throwing random exceptions - Clean old known problematic codepoints - Minor: confusing hexadecimal codepoint * Update bruteforce random tests - Add detokenizer checks - New generator: ascii_lr_strip - New generator: apostrophe - Add more vocabs files - Detokenize special tokens. - Replace errors with '\uFFFD' when detokenizing to 'utf-8' - More edge cases - Better detokenization results check * Fix add_space_prefix, set false by default * Better leading space removal * Do not remove space when decoding special tokens * Bugfix: custom regexs splits undefined unicode codepoints * 'viking' detokenizer clean spaces
This commit is contained in:
parent
be20e7f49d
commit
213701b51a
11 changed files with 499 additions and 265 deletions
254
src/llama.cpp
254
src/llama.cpp
|
@ -1995,18 +1995,19 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
|||
|
||||
// NOTE: avoid ever using this except for building the token_to_piece caches
|
||||
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache
|
||||
const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
|
||||
if (n_chars < 0) {
|
||||
piece.resize(-n_chars);
|
||||
int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
|
||||
GGML_ASSERT(check == -n_chars);
|
||||
}
|
||||
else {
|
||||
result.resize(n_tokens);
|
||||
piece.resize(n_chars);
|
||||
}
|
||||
|
||||
return std::string(result.data(), result.size());
|
||||
return piece;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
|
||||
|
@ -2586,10 +2587,11 @@ struct llama_vocab {
|
|||
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
|
||||
|
||||
// tokenizer flags
|
||||
bool tokenizer_add_space_prefix = true;
|
||||
bool tokenizer_add_space_prefix = false;
|
||||
bool tokenizer_add_bos = false;
|
||||
bool tokenizer_add_eos = false;
|
||||
bool tokenizer_ignore_merges = false;
|
||||
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
|
||||
bool tokenizer_remove_extra_whitespaces = false;
|
||||
bool tokenizer_escape_whitespaces = true;
|
||||
bool tokenizer_treat_whitespace_as_suffix = false;
|
||||
|
@ -5230,11 +5232,6 @@ static void llm_load_vocab(
|
|||
vocab.special_pad_id = -1;
|
||||
vocab.special_cls_id = -1;
|
||||
vocab.special_mask_id = -1;
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
} // The default value of add_space_prefix is true.
|
||||
} else if (tokenizer_model == "bert") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
||||
|
||||
|
@ -5246,15 +5243,9 @@ static void llm_load_vocab(
|
|||
vocab.special_pad_id = 0;
|
||||
vocab.special_cls_id = 101;
|
||||
vocab.special_mask_id = 103;
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
} else if (tokenizer_model == "gpt2") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
}
|
||||
|
||||
// read bpe merges and populate bpe ranks
|
||||
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
||||
if (merges_keyidx == -1) {
|
||||
|
@ -5333,6 +5324,8 @@ static void llm_load_vocab(
|
|||
|
||||
// for now, only BPE models have pre-tokenizers
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
vocab.tokenizer_clean_spaces = true;
|
||||
if (tokenizer_pre.empty()) {
|
||||
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
||||
LLAMA_LOG_WARN("%s: \n", __func__);
|
||||
|
@ -5354,9 +5347,11 @@ static void llm_load_vocab(
|
|||
} else if (
|
||||
tokenizer_pre == "deepseek-llm") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "deepseek-coder") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "falcon") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
||||
|
@ -5368,6 +5363,7 @@ static void llm_load_vocab(
|
|||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
|
||||
} else if (
|
||||
tokenizer_pre == "gpt-2" ||
|
||||
tokenizer_pre == "phi-2" ||
|
||||
tokenizer_pre == "jina-es" ||
|
||||
tokenizer_pre == "jina-de" ||
|
||||
tokenizer_pre == "jina-v2-es" ||
|
||||
|
@ -5383,6 +5379,7 @@ static void llm_load_vocab(
|
|||
} else if (
|
||||
tokenizer_pre == "qwen2") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "stablelm2") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
|
||||
|
@ -5398,9 +5395,11 @@ static void llm_load_vocab(
|
|||
} else if (
|
||||
tokenizer_pre == "poro-chat") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "viking") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "jais") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
||||
|
@ -5409,10 +5408,14 @@ static void llm_load_vocab(
|
|||
}
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
vocab.tokenizer_add_space_prefix = true;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
vocab.tokenizer_add_bos = true;
|
||||
vocab.tokenizer_add_eos = false;
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
vocab.tokenizer_clean_spaces = true;
|
||||
vocab.tokenizer_add_bos = true;
|
||||
vocab.tokenizer_add_eos = false;
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
|
||||
|
@ -5422,6 +5425,11 @@ static void llm_load_vocab(
|
|||
} else {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
}
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
}
|
||||
}
|
||||
|
||||
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
|
||||
|
@ -5603,7 +5611,7 @@ static void llm_load_vocab(
|
|||
}
|
||||
}
|
||||
|
||||
std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
|
||||
std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
|
||||
[&] (const llama_vocab::id a, const llama_vocab::id b) {
|
||||
return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
|
||||
}
|
||||
|
@ -16098,7 +16106,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
||||
// tokenizer.encode('', add_special_tokens=False) returns []
|
||||
|
||||
bool is_prev_special = false;
|
||||
bool is_prev_special = true; // prefix with space if first token
|
||||
|
||||
if (add_special && vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
|
@ -16110,10 +16118,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
if (vocab.tokenizer_add_space_prefix) {
|
||||
if (!output.size() || is_prev_special) { // prefix with space if first token
|
||||
raw_text = " " + raw_text;
|
||||
}
|
||||
// prefix with space if previous is special
|
||||
if (vocab.tokenizer_add_space_prefix && is_prev_special) {
|
||||
raw_text = " " + raw_text;
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
|
@ -16122,6 +16129,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
llm_tokenizer_spm tokenizer(vocab);
|
||||
llama_escape_whitespace(raw_text);
|
||||
tokenizer.tokenize(raw_text, output);
|
||||
is_prev_special = false;
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
output.push_back(fragment.token);
|
||||
is_prev_special = true;
|
||||
|
@ -20904,85 +20912,66 @@ static std::string llama_decode_text(const std::string & text) {
|
|||
}
|
||||
|
||||
// does not write null-terminator to buf
|
||||
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
||||
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
|
||||
if (!special && llama_is_control_token(model->vocab, token)) {
|
||||
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
|
||||
const llama_token_attr attr = llama_token_get_attr(model, token);
|
||||
if (!special && (attr & attr_special)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// copy piece chars to output text buffer
|
||||
// skip up to 'lstrip' leading spaces before copying
|
||||
auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
|
||||
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
|
||||
token++;
|
||||
size--;
|
||||
}
|
||||
if (length < (int32_t)size) {
|
||||
return (int32_t) -size;
|
||||
}
|
||||
memcpy(buf, token, size);
|
||||
return (int32_t) size;
|
||||
};
|
||||
|
||||
// if we have a cache - use it
|
||||
{
|
||||
const auto & cache = model->vocab.cache_token_to_piece;
|
||||
|
||||
if (!cache.empty()) {
|
||||
const auto & res = cache.at(token);
|
||||
if (length < (int) res.size()) {
|
||||
return -(int) res.size();
|
||||
}
|
||||
memcpy(buf, res.c_str(), res.size());
|
||||
return res.size();
|
||||
const auto & result = cache.at(token);
|
||||
return _try_copy(result.data(), result.size());
|
||||
}
|
||||
}
|
||||
|
||||
if (0 <= token && token < llama_n_vocab(model)) {
|
||||
const std::string & token_text = model->vocab.id_to_token[token].text;
|
||||
switch (llama_vocab_get_type(model->vocab)) {
|
||||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
case LLAMA_VOCAB_TYPE_SPM:
|
||||
case LLAMA_VOCAB_TYPE_UGM: {
|
||||
// NOTE: we accept all unsupported token types,
|
||||
// suppressing them like CONTROL tokens.
|
||||
if (llama_is_normal_token(model->vocab, token)) {
|
||||
std::string result = model->vocab.id_to_token[token].text;
|
||||
if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
|
||||
return _try_copy(token_text.data(), token_text.size());
|
||||
} else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
|
||||
std::string result = token_text;
|
||||
llama_unescape_whitespace(result);
|
||||
if (length < (int) result.length()) {
|
||||
return -(int) result.length();
|
||||
}
|
||||
memcpy(buf, result.c_str(), result.length());
|
||||
return result.length();
|
||||
} else if (
|
||||
(llama_is_user_defined_token(model->vocab, token)) ||
|
||||
(llama_is_control_token (model->vocab, token) && special)) {
|
||||
std::string result = model->vocab.id_to_token[token].text;
|
||||
if (length < (int) result.length()) {
|
||||
return -(int) result.length();
|
||||
}
|
||||
memcpy(buf, result.c_str(), result.length());
|
||||
return result.length();
|
||||
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
|
||||
if (length < 3) {
|
||||
return -3;
|
||||
}
|
||||
memcpy(buf, "\xe2\x96\x85", 3);
|
||||
return 3;
|
||||
} else if (llama_is_byte_token(model->vocab, token)) {
|
||||
if (length < 1) {
|
||||
return -1;
|
||||
}
|
||||
buf[0] = llama_token_to_byte(model->vocab, token);
|
||||
return 1;
|
||||
return _try_copy(result.data(), result.size());
|
||||
} else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
|
||||
char byte = (char) llama_token_to_byte(model->vocab, token);
|
||||
return _try_copy((char*) &byte, 1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case LLAMA_VOCAB_TYPE_BPE: {
|
||||
// NOTE: we accept all unsupported token types,
|
||||
// suppressing them like CONTROL tokens.
|
||||
if (llama_is_normal_token(model->vocab, token)) {
|
||||
std::string result = model->vocab.id_to_token[token].text;
|
||||
result = llama_decode_text(result);
|
||||
if (length < (int) result.length()) {
|
||||
return -(int) result.length();
|
||||
}
|
||||
memcpy(buf, result.c_str(), result.length());
|
||||
return result.length();
|
||||
} else if (
|
||||
(llama_is_user_defined_token(model->vocab, token)) ||
|
||||
(llama_is_control_token (model->vocab, token) && special)) {
|
||||
std::string result = model->vocab.id_to_token[token].text;
|
||||
if (length < (int) result.length()) {
|
||||
return -(int) result.length();
|
||||
}
|
||||
memcpy(buf, result.c_str(), result.length());
|
||||
return result.length();
|
||||
if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
|
||||
return _try_copy(token_text.data(), token_text.size());
|
||||
} else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
|
||||
std::string result = llama_decode_text(token_text);
|
||||
return _try_copy(result.data(), result.size());
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -20993,6 +20982,113 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|||
return 0;
|
||||
}
|
||||
|
||||
int32_t llama_detokenize(
|
||||
const struct llama_model * model,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
char * text,
|
||||
int32_t text_len_max,
|
||||
bool remove_special,
|
||||
bool unparse_special) {
|
||||
int32_t avail = text_len_max;
|
||||
int32_t total = 0;
|
||||
|
||||
// remove the leading space
|
||||
bool remove_space = model->vocab.tokenizer_add_space_prefix;
|
||||
|
||||
if (remove_special && model->vocab.tokenizer_add_bos) {
|
||||
if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) {
|
||||
remove_space = false;
|
||||
n_tokens--;
|
||||
tokens++;
|
||||
}
|
||||
}
|
||||
|
||||
if (remove_special && model->vocab.tokenizer_add_eos) {
|
||||
if (n_tokens > 0 && tokens[n_tokens-1] == model->vocab.special_eos_id) {
|
||||
n_tokens--;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
GGML_ASSERT(avail >= 0);
|
||||
int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special);
|
||||
remove_space = false;
|
||||
if (n_chars < 0) {
|
||||
avail = 0;
|
||||
total -= n_chars;
|
||||
} else if (n_chars > 0) {
|
||||
avail -= n_chars;
|
||||
text += n_chars;
|
||||
total += n_chars;
|
||||
}
|
||||
}
|
||||
|
||||
if (total > text_len_max) {
|
||||
return -total;
|
||||
}
|
||||
|
||||
if (model->vocab.tokenizer_clean_spaces) {
|
||||
text -= total; // restart text
|
||||
|
||||
// first pass: characters ?!., //TODO: where do these characters come from?
|
||||
const int32_t total1 = total;
|
||||
total = total ? 1 : 0;
|
||||
for (int32_t i = 1; i < total1; ++i) {
|
||||
const char x = text[i];
|
||||
if (text[i - 1] == ' ') {
|
||||
if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
|
||||
total--; // remove space
|
||||
}
|
||||
}
|
||||
text[total++] = x;
|
||||
}
|
||||
|
||||
// second pass: strip single apostrophe between spaces
|
||||
const int32_t total2 = total;
|
||||
total = total ? 1 : 0;
|
||||
for (int32_t i = 1; i < total2; ++i) {
|
||||
const char x = text[i];
|
||||
if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
|
||||
total--; // remove prev space
|
||||
text[++i] = '\0'; // remove next space
|
||||
}
|
||||
text[total++] = x;
|
||||
}
|
||||
|
||||
// third pass: apostrophe contractions //NOTE: this makes sense?
|
||||
const int32_t total3 = total;
|
||||
total = total ? 1 : 0;
|
||||
for (int32_t i = 1; i < total3; ++i) {
|
||||
const char x = text[i];
|
||||
if (text[i - 1] == ' ') {
|
||||
if (x == '\'' && i + 1 < total3) {
|
||||
const char x1 = text[i + 1];
|
||||
if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
|
||||
//total--; // remove space
|
||||
} else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
|
||||
total--; // remove space
|
||||
} else if (i + 2 < total3) {
|
||||
const char x2 = text[i + 2];
|
||||
if ((x1 == 'l' && x2 == 'l')) { // " 'll"
|
||||
//total--; // remove space
|
||||
} else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
|
||||
total--; // remove space
|
||||
} else {
|
||||
//total--; // remove space
|
||||
}
|
||||
} else {
|
||||
//total--; // remove space
|
||||
}
|
||||
}
|
||||
}
|
||||
text[total++] = x;
|
||||
}
|
||||
}
|
||||
|
||||
return total <= text_len_max ? total : -total;
|
||||
}
|
||||
|
||||
// trim whitespace from the beginning and end of a string
|
||||
static std::string trim(const std::string & str) {
|
||||
size_t start = 0;
|
||||
|
|
|
@ -232,8 +232,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
|
@ -295,9 +294,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|||
continue;
|
||||
}
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+
|
||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
pos += (cpt == ' ');
|
||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
|
@ -351,8 +350,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
|
@ -394,8 +392,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
}
|
||||
}
|
||||
|
||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
||||
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
|
||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
|
||||
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
|
||||
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
||||
pos++;
|
||||
while (_get_flags(pos).is_letter) {
|
||||
|
@ -421,9 +419,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
|
||||
pos += (cpt == ' ');
|
||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
||||
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
uint32_t cpt2 = _get_cpt(pos);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue