llama2.c: convert special-cased "<0xXX>" single byte tokens from tokenizer.bin
This commit is contained in:
parent
8ad1e2d8d1
commit
8d0dc476c9
1 changed file with 6 additions and 0 deletions
|
@@ -568,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
||||||
float_t score = file.read_f32();
|
float_t score = file.read_f32();
|
||||||
uint32_t len = file.read_u32();
|
uint32_t len = file.read_u32();
|
||||||
std::string text = file.read_string(len);
|
std::string text = file.read_string(len);
|
||||||
|
// Special-case handling of <0xXX> single byte tokens.
|
||||||
|
char byte_val;
|
||||||
|
if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
|
||||||
|
char cstr[2] = { byte_val, 0 };
|
||||||
|
text = cstr;
|
||||||
|
}
|
||||||
vocab->id_to_token[i].text = text;
|
vocab->id_to_token[i].text = text;
|
||||||
vocab->id_to_token[i].score = score;
|
vocab->id_to_token[i].score = score;
|
||||||
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
|
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue