Added whitespace escaping and unescaping

Now we see some resemblance to the Meta-Tokenizer, I think. The only remaining problem: how to integrate this into the `llama.cpp` kernel.
This commit is contained in:
goerch 2023-07-22 22:24:21 +02:00
parent 94a0ee1eb8
commit 0e74a7222e
4 changed files with 94 additions and 35 deletions

View file

@ -233,12 +233,7 @@ class SentencePieceVocab:
for i in range(tokenizer.vocab_size()):
# TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
piece = tokenizer.id_to_piece(i)
text: bytes
if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
text: bytes = piece.encode("utf-8")
else:
text = piece.replace("\u2581", " ").encode("utf-8")
text: bytes = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
yield text, score

View file

@ -1832,13 +1832,13 @@ struct llama_tokenizer {
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars
// split string into utf8 chars / token?
int index = 0;
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
// size_t len = utf8_len(text[offs]);
size_t len = llama_trie_find(vocab_.trie, text, offs);
size_t len = utf8_len(text[offs]);
// size_t len = llama_trie_find(vocab_.trie, text, offs);
if (len == 0) {
len = utf8_len(text[offs]);
}

View file

@ -5,13 +5,44 @@
#include <map>
#include <vector>
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
// Escape whitespace the way SentencePiece expects: prepend one U+2581 (▁)
// marker and replace each run of whitespace (' ', '\t', '\n') with a single
// marker. Note: consecutive whitespace collapses to one marker, so the
// escaping is lossy for runs.
static std::string escape_whitespace(const std::string& text) {
    const std::string space_mark = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    std::string out = space_mark;                  // leading marker, as SentencePiece does
    bool prev_was_ws = false;
    for (char c : text) {
        const bool is_ws = (c == ' ' || c == '\t' || c == '\n');
        if (is_ws) {
            if (!prev_was_ws) {
                out += space_mark; // emit one marker per whitespace run
            }
        } else {
            out += c;
        }
        prev_was_ws = is_ws;
    }
    return out;
}
// Convert one token id back to text, mapping a leading U+2581 (▁) marker
// back to a plain space. Only a single leading marker is handled; markers
// elsewhere in the token text pass through unchanged.
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
    const std::string piece = llama_token_to_str(ctx, token);
    const std::string marker = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    if (piece.rfind(marker, 0) == 0) {         // prefix check
        return " " + piece.substr(marker.size());
    }
    return piece;
}
// Concatenate the unescaped text of `count` tokens into a single string.
// Each token is run through the single-token overload, which maps a leading
// U+2581 (▁) marker back to a space.
// NOTE(review): the previous revision also appended the raw token text and a
// '_' separator; those leftover lines made every token appear twice in the
// output — they are removed here.
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
    std::string result;
    for (int i = 0; i < count; ++i) {
        result += unescape_whitespace(ctx, tokens[i]);
    }
    return result;
}
@ -19,12 +50,14 @@ std::string detokenize(llama_context * ctx, const llama_token * tokens, int coun
// Expected tokenizations for the whitespace-escaping tokenizer. Each entry
// maps an input string to the expected token ids (token 1 is BOS).
// NOTE(review): the previous revision's entries were still present above the
// updated ones; with duplicate keys in a std::map initializer the FIRST
// occurrence wins, so the stale expectations silently shadowed the updated
// ones — the stale entries are removed here.
static const std::map<std::string, std::vector<llama_token>> & k_tests()
{
    static std::map<std::string, std::vector<llama_token>> _k_tests = {
        { "Hello world",        { 1, 15043, 3186, }, },
        { " Hello world",       { 1, 29871, 15043, 3186, }, },
        { "Hello World",        { 1, 15043, 2787, }, },
        { " Hello World",       { 1, 29871, 15043, 2787, }, },
        { " Hello World!",      { 1, 29871, 15043, 2787, 29991, }, },
        { " this is 🦙.cpp",    { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
        { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
        { "нещо на Български",  { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
    };
    return _k_tests;
};
@ -77,9 +110,9 @@ int main(int argc, char **argv) {
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res(test_kv.first.size());
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
__func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
res.resize(n);
bool correct = res.size() == test_kv.second.size();

View file

@ -8,13 +8,44 @@
#include <map>
#include <vector>
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
// SentencePiece-style whitespace escaping: a U+2581 (▁) marker is prepended,
// and every run of ' ', '\t' or '\n' becomes a single marker. Runs collapse,
// so the transformation is not reversible for consecutive whitespace.
static std::string escape_whitespace(const std::string& text) {
    const std::string space_mark = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    std::string escaped = space_mark;              // leading marker first
    bool in_run = false;
    for (char c : text) {
        const bool is_ws = (c == ' ' || c == '\t' || c == '\n');
        if (!is_ws) {
            escaped += c;
        } else if (!in_run) {
            escaped += space_mark; // one marker for the whole run
        }
        in_run = is_ws;
    }
    return escaped;
}
// Map one token id back to its text form, turning a leading U+2581 (▁)
// marker into a plain space. Markers that are not at the start of the
// token text are left as-is.
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
    const std::string word = llama_token_to_str(ctx, token);
    const std::string marker = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    if (word.compare(0, marker.size(), marker) == 0 && word.size() >= marker.size()) {
        return " " + word.substr(marker.size());
    }
    return word;
}
// Build the detokenized string for `count` tokens by unescaping each token
// individually (the single-token overload turns a leading U+2581 marker into
// a space) and concatenating the results.
// NOTE(review): leftover lines from the replaced detokenize() body also
// appended the raw token text plus a '_' separator, duplicating every token
// in the output — removed here.
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
    std::string result;
    for (int i = 0; i < count; ++i) {
        result += unescape_whitespace(ctx, tokens[i]);
    }
    return result;
}
@ -66,22 +97,22 @@ int main(int argc, char **argv) {
}
for (int i = 0; i < n_vocab; ++i) {
const char * forward = llama_token_to_str(ctx, i);
std::vector<llama_token> tokens(strlen(forward));
auto n = llama_tokenize(ctx, forward, tokens.data(), strlen(forward), false);
std::string forward = llama_token_to_str(ctx, i);
std::vector<llama_token> tokens(forward.length());
int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
if (n == 1) {
if (i != tokens[0]) {
const char* backward = llama_token_to_str(ctx, tokens[0]);
std::string backward = unescape_whitespace(ctx, tokens[0]);
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
__func__, i, forward, tokens[0], backward);
__func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
}
} else {
if (i <= 258) {
fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
} else {
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
}
}
}
@ -91,7 +122,7 @@ int main(int argc, char **argv) {
std::wstring wstr(1, ch);
std::string str = converter.to_bytes(wstr);
std::vector<llama_token> tokens(strlen(str.c_str()));
auto n = llama_tokenize(ctx, str.c_str(), tokens.data(), str.length(), false);
auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
if (n == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n",
__func__, str.c_str(), tokens[0]);