From 1b5442923ac017ee63cd405ffcedc6af1df1aaa0 Mon Sep 17 00:00:00 2001
From: goerch
Date: Sun, 6 Aug 2023 07:47:55 +0200
Subject: [PATCH] Fix tokenizer regression in convert.py and improve CPP
 interface for llama_tokenize

---
 convert.py                                    |  1 +
 examples/common.cpp                           | 11 ------
 examples/common.h                             |  6 ---
 examples/save-load-state/save-load-state.cpp  |  5 +--
 .../train-text-from-scratch.cpp               |  9 ++---
 llama.cpp                                     | 38 ++++++++++++++++++-
 llama.h                                       | 14 +++++++
 tests/test-tokenizer-0.cpp                    | 10 ++---
 tests/test-tokenizer-1.cpp                    | 18 ++++-----
 9 files changed, 69 insertions(+), 43 deletions(-)

diff --git a/convert.py b/convert.py
index dbd9366b9..d4750df9f 100755
--- a/convert.py
+++ b/convert.py
@@ -276,6 +276,7 @@ class SentencePieceVocab:
             piece = tokenizer.id_to_piece(i)
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
+            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
diff --git a/examples/common.cpp b/examples/common.cpp
index 9f1c0d904..d2fa68ad9 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -608,17 +608,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     return "The";
 }
 
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos + 1);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
diff --git a/examples/common.h b/examples/common.h
index 75f622c77..cea1fee02 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -102,12 +102,6 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_random_prompt(std::mt19937 & rng);
 
-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
 //
 // Model utils
 //
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index d09c27dae..3db61b754 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -44,9 +44,8 @@ int main(int argc, char ** argv) {
         llama_free_model(model);
         return 1;
     }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
         llama_free(ctx);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 6e25265d7..d446c6ea5 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2189,11 +2189,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
     f.read_raw(buf.data(), f.size);
     buf[f.size] = '\0';
 
-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
     }
 
     bool verify = false;
diff --git a/llama.cpp b/llama.cpp
index c5ca25dfc..53f73f6b8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4057,7 +4057,6 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize(model->vocab, text, add_bos, true);
 
     if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
 
@@ -4077,6 +4076,24 @@ int llama_tokenize(
     return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
 }
 
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int length = text.length() + add_bos;
+    std::vector<llama_token> result(length);
+    length = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
+}
+
 int llama_tokenize_bpe(
         struct llama_context * ctx,
                  const char * text,
@@ -4086,7 +4103,6 @@ int llama_tokenize_bpe(
     auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);
 
     if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
 
@@ -4097,6 +4113,24 @@ int llama_tokenize_bpe(
     return res.size();
 }
 
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int length = text.length() + add_bos;
+    std::vector<llama_token> result(length);
+    length = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
+}
+
 int llama_n_vocab_from_model(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
diff --git a/llama.h b/llama.h
index 435ce7ba7..b67867ae3 100644
--- a/llama.h
+++ b/llama.h
@@ -476,6 +476,20 @@ extern "C" {
 #include <vector>
 #include <string>
 
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+    struct llama_context * ctx,
+    const std::string & text,
+    bool   add_bos);
+
+std::vector<llama_token> llama_tokenize_bpe(
+    struct llama_context * ctx,
+    const std::string & text,
+    bool   add_bos);
+
 std::string llama_token_to_str(
     const struct llama_context * ctx,
     llama_token token);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index f8642996a..a523c320c 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -6,9 +6,9 @@
 #include <string>
 #include <vector>
-static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
+static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
     std::string result;
-    for (int i = 0; i < count; ++i) {
+    for (int i = 0; i < tokens.size(); ++i) {
         result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
@@ -90,11 +90,9 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res(test_kv.first.size() + 2);
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
+        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first.c_str(), true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
-            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
-        res.resize(n);
+            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
 
         bool correct = res.size() == test_kv.second.size();
 
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index cde7a203b..3803dad8d 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -28,9 +28,9 @@ static std::string escape_whitespace(const std::string& text) {
     return result;
 }
 
-static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
+static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
     std::string result;
-    for (int i = 0; i < count; ++i) {
+    for (int i = 0; i < tokens.size(); ++i) {
         result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
@@ -84,9 +84,8 @@ int main(int argc, char **argv) {
 
     for (int i = 0; i < n_vocab; ++i) {
         std::string forward = llama_token_to_str_bpe(ctx, i);
-        std::vector<llama_token> tokens(forward.length());
-        int n = llama_tokenize_bpe(ctx, forward.c_str(), tokens.data(), forward.length(), false);
-        if (n == 1) {
+        std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
+        if (tokens.size() == 1) {
             if (i != tokens[0]) {
                 std::string backward = llama_token_to_str(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
@@ -96,10 +95,10 @@ int main(int argc, char **argv) {
         } else {
            if (i <= 258) {
                fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
            } else {
                fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
                return 3;
            }
         }
@@ -109,9 +108,8 @@ int main(int argc, char **argv) {
     for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
         std::wstring wstr(1, ch);
         std::string str = converter.to_bytes(wstr);
-        std::vector<llama_token> tokens(str.length() + 1);
-        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length() + 1, false);
-        if (n == 1) {
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
            fprintf(stderr, "%s : info: %s tokenized to %d \n",
                __func__, str.c_str(), tokens[0]);
        }
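
Usage note (editorial addition, not part of the patch): a minimal sketch of how the new vector-returning overload is intended to be called. It assumes ctx is a llama_context that was created and loaded elsewhere; the prompt string and the fprintf loop are illustrative only.

    // sketch: tokenize a prompt without sizing the token buffer by hand
    std::string prompt = "Hello world";
    std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, true /* add_bos */);
    for (llama_token id : tokens) {
        // llama_token_to_str returns std::string per the declaration added to llama.h
        fprintf(stderr, "%d -> '%s'\n", id, llama_token_to_str(ctx, id).c_str());
    }

The overload first allocates text.length() + add_bos entries as an upper bound; if the C-style llama_tokenize instead reports the required count as a negative return value, the vector is resized and the call retried, so callers no longer need to guess a buffer size up front.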