Options to manage token text decoding errors:

Some models ('jais' and 'command-r') copy the original UTF-8 bytes on error.
Others ('deepseek') seem to use the replacement character U+FFFD (0xFFFD).
This commit is contained in:
jaime-m-p 2024-07-09 01:28:56 +02:00
parent dec64ef793
commit c184db74b3

View file

@ -21084,7 +21084,8 @@ int32_t llama_tokenize(
return res.size(); return res.size();
} }
static std::string llama_decode_text(const std::string & text) { // errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose
static std::string llama_decode_text(const std::string & text, const char errors = 'v') {
std::string decoded_text; std::string decoded_text;
const auto cpts = unicode_cpts_from_utf8(text); const auto cpts = unicode_cpts_from_utf8(text);
@ -21093,11 +21094,21 @@ static std::string llama_decode_text(const std::string & text) {
try { try {
decoded_text += unicode_utf8_to_byte(utf8); decoded_text += unicode_utf8_to_byte(utf8);
} catch (const std::out_of_range & /*e*/) { } catch (const std::out_of_range & /*e*/) {
decoded_text += "[UNK_BYTE_0x"; switch (errors) {
for (const auto c : utf8) { case 'c':
decoded_text += format("%02x", (uint8_t) c); decoded_text += utf8; // copy original
break;
case 'r':
decoded_text += "\xEF\xBF\xBD"; // 0xFFFD REPLACEMENT CHARACTER
break;
case 'v':
decoded_text += format("[UNK_BYTE_0x%02X]", cpt);
break;
case 'i':
default:
// ignore
break;
} }
decoded_text += text + "]";
} }
} }
@ -21163,7 +21174,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
return _try_copy(token_text.data(), token_text.size()); return _try_copy(token_text.data(), token_text.size());
} else if (attr & LLAMA_TOKEN_ATTR_NORMAL) { } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
std::string result = llama_decode_text(token_text); std::string result = llama_decode_text(token_text, 'c'); // copy on error //TODO: use a tokenizer variable
return _try_copy(result.data(), result.size()); return _try_copy(result.data(), result.size());
} }
break; break;