From a95ae7526a63d496c7aebff3787e9720ce837298 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Wed, 23 Aug 2023 00:02:13 +0200
Subject: [PATCH] llama.cpp : fix bpe tokenizer

---
 llama.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4b539516c..61e734179 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2839,10 +2839,14 @@ static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
 }
 
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto& token_data = vocab.id_to_token.at(id);
-    auto buf = token_data.text.substr(3, 2);
-    return strtol(buf.c_str(), NULL, 16);
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        char buf[7];
+        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+        GGML_ASSERT(0 <= result && result < 7);
+        return vocab.token_to_id.at(buf);
+    }
+    // vocab.type == LLAMA_VOCAB_TYPE_BPE
+    return vocab.token_to_id.at(std::string(1, ch));
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {