llama2.c: convert special-cased "<0xXX>" single byte tokens from tokenizer.bin

This commit is contained in:
ochafik 2023-08-23 19:56:16 +01:00
parent 8ad1e2d8d1
commit 8d0dc476c9

View file

@@ -568,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
float_t score = file.read_f32(); float_t score = file.read_f32();
uint32_t len = file.read_u32(); uint32_t len = file.read_u32();
std::string text = file.read_string(len); std::string text = file.read_string(len);
// Special-case handling of <0xXX> single byte tokens.
char byte_val;
if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
char cstr[2] = { byte_val, 0 };
text = cstr;
}
vocab->id_to_token[i].text = text; vocab->id_to_token[i].text = text;
vocab->id_to_token[i].score = score; vocab->id_to_token[i].score = score;
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED; vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;