From 6ed2f795ae552929c855814870ad46bb2611a2ea Mon Sep 17 00:00:00 2001
From: thxCode
Date: Tue, 6 Aug 2024 17:25:49 +0800
Subject: [PATCH] fix: crash on token not found at spm

Signed-off-by: thxCode
---
 src/llama-vocab.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index e6d6059d0..79d79e2ce 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -279,7 +279,13 @@ private:
             // output any symbols that did not form tokens as bytes.
             output.reserve(output.size() + symbol.n);
             for (int j = 0; j < (int)symbol.n; ++j) {
-                llama_vocab::id token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
+                llama_vocab::id token_id;
+                try {
+                    token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
+                } catch(const std::exception & e) {
+                    // not found, use UNK token instead.
+                    token_id = vocab.special_unk_id;
+                }
                 output.push_back(token_id);
             }
             return;