From 8d0dc476c9ddd329cd22f0f1e970ce43dbe09168 Mon Sep 17 00:00:00 2001 From: ochafik Date: Wed, 23 Aug 2023 19:56:16 +0100 Subject: [PATCH] llama2.c: convert special-cased "<0xXX>" single byte tokens from tokenizer.bin --- .../convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 3a7382ca4..1551a85cd 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -568,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) float_t score = file.read_f32(); uint32_t len = file.read_u32(); std::string text = file.read_string(len); + // Special-case handling of <0xXX> single byte tokens. + char byte_val; + if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { + char cstr[2] = { byte_val, 0 }; + text = cstr; + } vocab->id_to_token[i].text = text; vocab->id_to_token[i].score = score; vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;