From 17ca832717d81a40cc4bb54c00cfbf920e7f25c6 Mon Sep 17 00:00:00 2001
From: goerch
Date: Tue, 19 Sep 2023 19:04:48 +0200
Subject: [PATCH] Streamlining code and adding some more assertions

Important change: I'm classifying added tokens as control tokens now for BPE.
---
 convert-falcon-hf-to-gguf.py  |  8 ++---
 convert.py                    | 10 ++----
 llama.cpp                     | 66 +++++++++++++++++++---------------
 models/ggml-vocab-aquila.gguf | Bin 4825676 -> 4825676 bytes
 models/ggml-vocab-falcon.gguf | Bin 2547782 -> 2547782 bytes
 5 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index 8de3f3126..c8d36ee38 100755
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -161,13 +161,9 @@ byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    text = reverse_vocab[i]
-    tokens.append(text)
+    tokens.append(reverse_vocab[i])
     scores.append(0.0) # dummy
-    if text in byte_decoder:
-        toktypes.append(gguf.TokenType.BYTE)
-    else:
-        toktypes.append(gguf.TokenType.NORMAL)
+    toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
diff --git a/convert.py b/convert.py
index f55afdcd3..7b9ae45a8 100755
--- a/convert.py
+++ b/convert.py
@@ -343,19 +343,13 @@ class BpeVocab:
         byte_encoder = tokenization_gpt2.bytes_to_unicode()
         byte_decoder = {v: k for k, v in byte_encoder.items()}
 
-        score = 0.0
         for i, _ in enumerate(tokenizer):
-            text = reverse_vocab[i]
-            if text in byte_decoder:
-                toktype = gguf.TokenType.BYTE
-            else:
-                toktype = gguf.TokenType.NORMAL
-            yield text, score, toktype
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.bpe_tokens()
diff --git a/llama.cpp b/llama.cpp
index 74f40497d..c6327af62 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3884,6 +3884,10 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -7224,47 +7228,53 @@ static std::string llama_decode_text(const std::string& text) {
 // does not write null-terminator to buf
 int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
-        if (llama_is_normal_token(model->vocab, token)) {
-            std::string result = model->vocab.id_to_token[token].text;
-            if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
+        switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
-            } else if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_BPE) {
-                result = llama_decode_text(result);
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (length < (int) result.length()) {
-                return -result.length();
-            }
-            memcpy(buf, result.c_str(), result.length());
-            return result.length();
-        } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
-            if (length < 3) {
-                return -3;
-            }
-            buf[0] = '\xe2';
-            buf[1] = '\x96';
-            buf[2] = '\x85';
-            return 3;
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
-        } else if (llama_is_byte_token(model->vocab, token)) {
-            if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+                if (length < 3) {
+                    return -3;
+                }
+                memcpy(buf, "\xe2\x96\x85", 3);
+                return 3;
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else if (llama_is_byte_token(model->vocab, token)) {
                 if (length < 1) {
                     return -1;
                 }
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
-            } else if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_BPE) {
-                std::string result = llama_decode_text(model->vocab.id_to_token[token].text);
-                if (length < (int)result.length()) {
+            } else {
+                GGML_ASSERT(false);
+            }
+            break;
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                result = llama_decode_text(result);
+                if (length < (int) result.length()) {
                     return -result.length();
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
             } else {
                 GGML_ASSERT(false);
             }
+            break;
+        }
+        default:
+            GGML_ASSERT(false);
         }
     }
     return 0;
diff --git a/models/ggml-vocab-aquila.gguf b/models/ggml-vocab-aquila.gguf
index 5ffc9c3cf80e9d1f169744853095caa32f092751..7a9abb122ddd18706100e155535f29120ca7ebc4 100644
GIT binary patch
delta 286
zcmaLG*G&Qe0D$46C>HcA2xmt*8>naRSP^vx2jG#!1$_7u25<)_Fu4VIWEN-epTIBq
z2L12sz@KEuk|WO$!xR`{lrhFBGQlJvQ%p0%EG6bBQz2rW1r}LinH5&4vc@{KpZLAh
z>qN0Pyy-1(dnfHg^=>)sMsYD;%>_a5->=_Hr57f3H@xdT?>q5<4}IigpZL^gKKF$$
eedTN4xanKp`Q8tH^pjhDcH1w0O~d4^`SAyjKu4zl

delta 1310
[base85 delta payload garbled in this copy]

diff --git a/models/ggml-vocab-falcon.gguf b/models/ggml-vocab-falcon.gguf
[index line garbled in this copy]
GIT binary patch
delta 1048
[base85 delta payload truncated in this copy]
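
Note on the change flagged in the commit message: added tokens in a BPE vocab are now written as gguf.TokenType.CONTROL rather than USER_DEFINED, and the restructured BPE branch of llama_token_to_piece_with_model handles only NORMAL and CONTROL tokens before hitting GGML_ASSERT(false). Classifying added tokens as CONTROL therefore routes them to the silent ';' branch, which reports a zero-length piece. A hypothetical check of that behavior, not part of the patch (llama.h is assumed available; added_token_id is an assumed id of an added token, not a value from this patch):

    #include <cassert>

    #include "llama.h"

    // Hypothetical check: with this patch, an added token in a BPE vocab is
    // typed CONTROL, so the piece writer skips it and reports a 0-length
    // piece instead of tripping GGML_ASSERT(false) for unhandled types.
    static void check_added_token_piece(const struct llama_model * model, llama_token added_token_id) {
        char buf[8];
        int n = llama_token_to_piece_with_model(model, added_token_id, buf, (int) sizeof(buf));
        assert(n == 0); // CONTROL falls through the empty ';' branch and returns 0
    }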
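
The restructured function also keeps the existing convention that a negative return value is the required buffer length. A minimal caller sketch under the same assumptions (llama_token_to_piece_with_model declared in llama.h as in this commit):

    #include <algorithm>
    #include <string>
    #include <vector>

    #include "llama.h"

    // Grow-and-retry wrapper around the negative-length convention: a return
    // value of -n means the piece needs n bytes, so resize and call again.
    static std::string token_to_piece(const struct llama_model * model, llama_token token) {
        std::vector<char> buf(8); // deliberately small to exercise the retry path
        int n = llama_token_to_piece_with_model(model, token, buf.data(), (int) buf.size());
        if (n < 0) {
            buf.resize(-n); // -n is the exact length the piece requires
            n = llama_token_to_piece_with_model(model, token, buf.data(), (int) buf.size());
        }
        // control tokens (including BPE added tokens after this patch) yield n == 0
        return std::string(buf.data(), std::max(n, 0));
    }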