From 72b353f5550a03382d6b54e890212b181a4533e4 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 13 Feb 2024 13:05:51 +0200 Subject: [PATCH] common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs --- llama.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 4e3a7f3d4..8ebbf7628 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7751,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; - return vocab.token_to_id.at(buf); + auto token = vocab.token_to_id.find(buf); + if (token != vocab.token_to_id.end()) { + return (*token).second; + } + // Try to fall back to just the byte as a string + const char buf2[2] = { (char)ch, 0 }; + return vocab.token_to_id.at(buf2); } case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_BPE: {