From 8d0dc476c9ddd329cd22f0f1e970ce43dbe09168 Mon Sep 17 00:00:00 2001
From: ochafik <ochafik@google.com>
Date: Wed, 23 Aug 2023 19:56:16 +0100
Subject: [PATCH] llama2.c: convert special-cased "<0xXX>" single byte tokens
 from tokenizer.bin

---
 .../convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp     | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 3a7382ca4..1551a85cd 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -568,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
             std::string text = file.read_string(len);
+            // Special-case handling of <0xXX> single byte tokens: only an exact, whole-string match is replaced by the raw byte it names.
+            unsigned char byte_val = 0; // %hhX stores through a pointer to unsigned char
+            int n_parsed = 0;           // written by %n only when everything up to and including the closing '>' matched
+            if (text.size() == 6 && sscanf(text.c_str(), "<0x%02hhX>%n", &byte_val, &n_parsed) == 1 && n_parsed == 6) {
+                text.assign(1, static_cast<char>(byte_val)); // (count, char) form keeps <0x00> as a one-byte string instead of an empty one
+            }
             vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
             vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;