From c184db74b31f5d898161a0b48366f24fe1d429e7 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 9 Jul 2024 01:28:56 +0200 Subject: [PATCH] Options to mange token text decoding errors: Some models ('jais' and 'command-r') copy original utf8 on error. Others ('deepseek') seems to use the replacement character 0xFFFD. --- src/llama.cpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2b9ace285..f61f60907 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21084,7 +21084,8 @@ int32_t llama_tokenize( return res.size(); } -static std::string llama_decode_text(const std::string & text) { +// errors: 'c': copy, 'i': ignore, 'r': replace 0xFFFD, 'v': verbose +static std::string llama_decode_text(const std::string & text, const char errors = 'v') { std::string decoded_text; const auto cpts = unicode_cpts_from_utf8(text); @@ -21093,11 +21094,21 @@ static std::string llama_decode_text(const std::string & text) { try { decoded_text += unicode_utf8_to_byte(utf8); } catch (const std::out_of_range & /*e*/) { - decoded_text += "[UNK_BYTE_0x"; - for (const auto c : utf8) { - decoded_text += format("%02x", (uint8_t) c); + switch (errors) { + case 'c': + decoded_text += utf8; // copy original + break; + case 'r': + decoded_text += "\xEF\xBF\xBD"; // 0xFFFD REPLACEMENT CHARACTER + break; + case 'v': + decoded_text += format("[UNK_BYTE_0x%02X]", cpt); + break; + case 'i': + default: + // ignore + break; } - decoded_text += text + "]"; } } @@ -21163,7 +21174,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) { return _try_copy(token_text.data(), token_text.size()); } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) { - std::string result = llama_decode_text(token_text); + std::string result = llama_decode_text(token_text, 'c'); // copy on error //TODO: use a tokenizer variable return _try_copy(result.data(), result.size()); } break;