diff --git a/convert.py b/convert.py
index ab4e5d4d6..8cd9ca75c 100755
--- a/convert.py
+++ b/convert.py
@@ -233,12 +233,7 @@ class SentencePieceVocab:
         for i in range(tokenizer.vocab_size()):
             # TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
             piece = tokenizer.id_to_piece(i)
-            text: bytes
-            if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
-                text: bytes = piece.encode("utf-8")
-            else:
-                text = piece.replace("\u2581", " ").encode("utf-8")
-
+            text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
             yield text, score

diff --git a/llama.cpp b/llama.cpp
index dc6d11b62..9aa29f457 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1832,13 +1832,13 @@ struct llama_tokenizer {
     llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars
+        // split string into utf8 chars / token?
         int index = 0;
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            // size_t len = utf8_len(text[offs]);
-            size_t len = llama_trie_find(vocab_.trie, text, offs);
+            size_t len = utf8_len(text[offs]);
+            // size_t len = llama_trie_find(vocab_.trie, text, offs);
             if (len == 0) {
                 len = utf8_len(text[offs]);
             }
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 7067b9206..d738bf680 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -5,13 +5,44 @@
 #include <map>
 #include <vector>

-std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
+    const char* word = llama_token_to_str(ctx, token);
+    if (strlen(word) >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + (word + 3);
+    }
+    return word;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-        if (i < count - 1) {
-            result += "_";
-        }
+        result += unescape_whitespace(ctx, tokens[i]);
     }
     return result;
 }
@@ -19,12 +50,14 @@
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "Hello World",        { 1, 10994, 2787, }, },
-        { " Hello World",       { 1, 15043, 2787, }, },
-        { " Hello World!",      { 1, 15043, 2787, 29991, }, },
-        { " this is 🦙.cpp",    { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
-        { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
-        { "нещо на Български",  { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
+        { "Hello world",        { 1, 15043, 3186, }, },
+        { " Hello world",       { 1, 29871, 15043, 3186, }, },
+        { "Hello World",        { 1, 15043, 2787, }, },
+        { " Hello World",       { 1, 29871, 15043, 2787, }, },
+        { " Hello World!",      { 1, 29871, 15043, 2787, 29991, }, },
+        { " this is 🦙.cpp",    { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
+        { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
+        { "нещо на Български",  { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
     };
     return _k_tests;
 };
@@ -77,9 +110,9 @@ int main(int argc, char **argv) {
     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
+        const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
-            __func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
+            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
         res.resize(n);

         bool correct = res.size() == test_kv.second.size();
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index 456500f9e..632e0525a 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -8,13 +8,44 @@
 #include <vector>
 #include <locale>

-std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
+    const char* word = llama_token_to_str(ctx, token);
+    if (strlen(word) >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + (word + 3);
+    }
+    return word;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-        if (i < count - 1) {
-            result += "_";
-        }
+        result += unescape_whitespace(ctx, tokens[i]);
     }
     return result;
 }
@@ -66,22 +97,22 @@ int main(int argc, char **argv) {
     }

     for (int i = 0; i < n_vocab; ++i) {
-        const char * forward = llama_token_to_str(ctx, i);
-        std::vector<llama_token> tokens(strlen(forward));
-        auto n = llama_tokenize(ctx, forward, tokens.data(), strlen(forward), false);
+        std::string forward = llama_token_to_str(ctx, i);
+        std::vector<llama_token> tokens(forward.length());
+        int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
         if (n == 1) {
             if (i != tokens[0]) {
-                const char* backward = llama_token_to_str(ctx, tokens[0]);
+                std::string backward = unescape_whitespace(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
-                    __func__, i, forward, tokens[0], backward);
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
             }
         } else {
             if (i <= 258) {
                 fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
-                    __func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             } else {
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
-                    __func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             }
         }
     }
@@ -91,7 +122,7 @@ int main(int argc, char **argv) {
         std::wstring wstr(1, ch);
         std::string str = converter.to_bytes(wstr);
         std::vector<llama_token> tokens(strlen(str.c_str()));
-        auto n = llama_tokenize(ctx, str.c_str(), tokens.data(), str.length(), false);
+        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
         if (n == 1) {
             fprintf(stderr, "%s : info: %s tokenized to %d \n",
                 __func__, str.c_str(), tokens[0]);
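
For context (not part of the patch): the helpers added to both tests implement the SentencePiece whitespace convention, where runs of whitespace are replaced by U+2581 (the "lower one eighth block" character, UTF-8 bytes 0xE2 0x96 0x81) before tokenizing, and the marker is mapped back to a space per token when printing. Below is a minimal, self-contained sketch of that round trip with no llama.cpp dependency; the names k_ws and unescape_piece and the main driver are illustrative only and do not appear in the patch.

#include <cstdio>
#include <string>

// U+2581 LOWER ONE EIGHTH BLOCK, the SentencePiece whitespace marker.
static const std::string k_ws = "\xe2\x96\x81";

// Mirrors escape_whitespace() from the tests: prepend one marker, then
// collapse each run of spaces/tabs/newlines into a single marker.
static std::string escape_whitespace(const std::string & text) {
    std::string result = k_ws;
    bool escaping = false;
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
            if (!escaping) {
                result += k_ws;
                escaping = true;
            }
        } else {
            escaping = false;
            result += text[offs];
        }
    }
    return result;
}

// Inverse direction for a single piece, as the per-token
// unescape_whitespace() does: a leading marker becomes one space.
static std::string unescape_piece(const std::string & piece) {
    if (piece.compare(0, k_ws.size(), k_ws) == 0) {
        return " " + piece.substr(k_ws.size());
    }
    return piece;
}

int main() {
    // "Hello  world" -> "▁Hello▁world": the double space collapses to one
    // marker, so runs of whitespace do not round-trip exactly.
    printf("escaped:   '%s'\n", escape_whitespace("Hello  world").c_str());
    printf("unescaped: '%s'\n", unescape_piece("\xe2\x96\x81Hello").c_str()); // " Hello"
    return 0;
}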