From 2d219b389e8c8c40bce547b08c8aa7add60fde1f Mon Sep 17 00:00:00 2001 From: Christian Fillion Date: Fri, 7 Feb 2025 08:55:47 -0500 Subject: [PATCH] vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729) Silently insert U+FFFD(s) (Unicode replacement character) instead until the next valid codepoint can be found. This fixes `llama_tokenize` throwing an exception across the C API boundary or libllama's module boundary (the caller's runtime might be incompatible!) Returning a proper error code might be desirable, however the signature of `llama_tokenize` doesn't allow it as all return values already have existing meaning. --- src/unicode.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 89180da41..a32ae6d08 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -618,7 +618,14 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) { result.reserve(utf8.size()); size_t offset = 0; while (offset < utf8.size()) { - result.push_back(unicode_cpt_from_utf8(utf8, offset)); + try { + result.push_back(unicode_cpt_from_utf8(utf8, offset)); + } + catch (const std::invalid_argument & /*ex*/) { + // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize + ++offset; + result.emplace_back(0xFFFD); // replacement character + } } return result; }