More testing of the tokenizer

Incorporates @howard0su's draft PR and adds trie-based prefix matching.
goerch 2023-07-22 18:37:58 +02:00
parent c8ae81756c
commit 94a0ee1eb8
3 changed files with 88 additions and 17 deletions

convert.py

@@ -233,7 +233,11 @@ class SentencePieceVocab:
         for i in range(tokenizer.vocab_size()):
             # TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
             piece = tokenizer.id_to_piece(i)
-            text: bytes = piece.encode("utf-8")
+            text: bytes
+            if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
+                text = piece.encode("utf-8")
+            else:
+                text = piece.replace("\u2581", " ").encode("utf-8")
             score: float = tokenizer.get_score(i)
             yield text, score
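With this change, only pieces flagged is_unknown, is_control or is_byte keep their raw piece text; ordinary pieces have SentencePiece's word-boundary marker "\u2581" replaced by a plain space before encoding. As a minimal illustration of that replacement on the C++ side (replace_u2581 is a hypothetical helper for illustration, not part of the commit):

#include <cstdio>
#include <string>

// "\u2581" is the three-byte UTF-8 sequence 0xE2 0x96 0x81, the
// marker SentencePiece uses for word boundaries.
static std::string replace_u2581(std::string piece) {
    static const std::string marker = "\xE2\x96\x81";
    size_t pos = 0;
    while ((pos = piece.find(marker, pos)) != std::string::npos) {
        piece.replace(pos, marker.size(), " ");
        pos += 1; // continue scanning after the inserted space
    }
    return piece;
}

int main() {
    printf("'%s'\n", replace_u2581("\xE2\x96\x81Hello").c_str()); // prints ' Hello'
}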

llama.cpp

@@ -242,6 +242,13 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_trie {
+    std::unordered_map<std::string, llama_trie> map;
+};
+
+void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs);
+size_t llama_trie_find(const struct llama_trie& trie, const std::string& text, size_t offs);
+
 struct llama_vocab {
     using id = int32_t;
     using token = std::string;
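Note: llama_trie stores its children by value in a std::unordered_map whose mapped type is llama_trie itself, still an incomplete type at that point. The standard only guarantees incomplete-type support for std::vector, std::list and std::forward_list, so this relies on the major standard-library implementations accepting it (in practice they do). A sketch of a strictly conforming alternative (trie_node is a hypothetical name, not part of the commit):

#include <memory>
#include <string>
#include <unordered_map>

// Children are held through std::unique_ptr, so the mapped type is
// complete where the container needs it.
struct trie_node {
    std::unordered_map<std::string, std::unique_ptr<trie_node>> children;
};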
@@ -253,6 +260,7 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_score> id_to_token;
+    struct llama_trie trie;
 };
 
 struct llama_model {
@@ -519,8 +527,10 @@ struct llama_file_loader {
             vocab.token_to_id[word] = i;
 
             auto & tok_score = vocab.id_to_token[i];
-            tok_score.tok = std::move(word);
+            tok_score.tok = word;
             tok_score.score = score;
+            llama_trie_insert(vocab.trie, word, 0);
         }
     }
 
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
@@ -1794,6 +1804,28 @@ struct llama_sp_bigram {
     size_t size;
 };
 
+void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs) {
+    if (offs < text.size()) {
+        size_t char_len = utf8_len(text[offs]);
+        std::string key = text.substr(offs, char_len);
+        if (trie.map.find(key) == trie.map.end()) {
+            trie.map[key] = llama_trie();
+        }
+        llama_trie_insert(trie.map.at(key), text, offs + char_len);
+    }
+}
+
+size_t llama_trie_find(const struct llama_trie& trie, const std::string & text, size_t offs) {
+    if (offs < text.size()) {
+        size_t char_len = utf8_len(text[offs]);
+        std::string key = text.substr(offs, char_len);
+        if (trie.map.find(key) != trie.map.end()) {
+            return char_len + llama_trie_find(trie.map.at(key), text, offs + char_len);
+        }
+    }
+    return 0;
+}
+
 // original implementation:
 // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
 struct llama_tokenizer {
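Two notes on the changes above. In the loader hunk, tok_score.tok = std::move(word) becomes a plain copy because word is subsequently passed to llama_trie_insert. The trie itself keys each node on one UTF-8 character (a 1-4 byte substring), and llama_trie_find returns the byte length of the longest walk through the trie, i.e. the longest prefix of text[offs:] that is also a prefix of some vocabulary token; since nodes carry no end-of-token flag, that length may stop inside a token rather than at a complete one, which the bigram merging and resegmentation below tolerate. A self-contained sketch of the same structure, simplified to one byte per edge (enough for ASCII input):

#include <cstdio>
#include <string>
#include <unordered_map>

// Minimal sketch of llama_trie; the real code uses utf8_len() to take
// 1-4 bytes per edge instead of a single byte.
struct trie {
    std::unordered_map<std::string, trie> map;
};

static void trie_insert(trie & t, const std::string & text, size_t offs) {
    if (offs < text.size()) {
        // operator[] default-constructs missing child nodes
        trie_insert(t.map[text.substr(offs, 1)], text, offs + 1);
    }
}

static size_t trie_find(const trie & t, const std::string & text, size_t offs) {
    if (offs < text.size()) {
        auto it = t.map.find(text.substr(offs, 1));
        if (it != t.map.end()) {
            return 1 + trie_find(it->second, text, offs + 1);
        }
    }
    return 0;
}

int main() {
    trie t;
    trie_insert(t, "Hello", 0);
    trie_insert(t, "He", 0);
    printf("%zu\n", trie_find(t, "Hellos", 0)); // 5: longest walk is "Hello"
    printf("%zu\n", trie_find(t, "xyz", 0));    // 0: caller falls back to utf8_len()
}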
@@ -1805,11 +1837,14 @@ struct llama_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            assert(utf8_len(text[offs]) <= text.size() - offs);
-            size_t char_len = utf8_len(text[offs]);
+            // size_t len = utf8_len(text[offs]);
+            size_t len = llama_trie_find(vocab_.trie, text, offs);
+            if (len == 0) {
+                len = utf8_len(text[offs]);
+            }
             sym.text = text.c_str() + offs;
-            sym.n = char_len;
-            offs += char_len;
+            sym.n = len;
+            offs += len;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
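The effect of this hunk: initial symbols are no longer single UTF-8 characters but the longest vocabulary-prefix match at each position, falling back to one character when the trie matches nothing. A small illustrative sketch of that segmentation loop, with a hypothetical longest_prefix() linear scan standing in for llama_trie_find:

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for llama_trie_find(): length of the longest common prefix
// between text[offs..] and any vocabulary token, found by linear scan.
static size_t longest_prefix(const std::vector<std::string> & vocab,
                             const std::string & text, size_t offs) {
    size_t best = 0;
    for (const auto & tok : vocab) {
        size_t n = 0;
        while (n < tok.size() && offs + n < text.size() && tok[n] == text[offs + n]) {
            n++;
        }
        best = std::max(best, n);
    }
    return best;
}

int main() {
    const std::vector<std::string> vocab = { " Hello", " world", "He" };
    const std::string text = " Hello world!";
    size_t offs = 0;
    while (offs < text.size()) {
        size_t len = longest_prefix(vocab, text, offs);
        if (len == 0) {
            len = 1; // fallback: a single (ASCII) character, like utf8_len()
        }
        // prints " Hello", " world" and "!" as the initial segments
        printf("segment: '%s'\n", text.substr(offs, len).c_str());
        offs += len;
    }
}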
@@ -1854,21 +1889,36 @@ struct llama_tokenizer {
         for (int i = 0; i != -1; i = symbols_[i].next) {
             auto & symbol = symbols_[i];
-            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
-
-            if (token == vocab_.token_to_id.end()) {
-                // output any symbols that did not form tokens as bytes.
-                for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
-                    output.push_back(token_id);
-                }
-            } else {
-                output.push_back((*token).second);
-            }
+            resegment(symbol, output);
         }
     }
 
 private:
+    void resegment(llama_sp_symbol &symbol, std::vector<llama_vocab::id> &output) {
+        auto text = std::string(symbol.text, symbol.n);
+        auto token = vocab_.token_to_id.find(text);
+
+        // Do we need to support is_unused?
+        if (token != vocab_.token_to_id.end()) {
+            output.push_back((*token).second);
+            return;
+        }
+
+        const auto p = rev_merge.find(text);
+
+        if (p == rev_merge.end()) {
+            // output any symbols that did not form tokens as bytes.
+            for (int j = 0; j < (int) symbol.n; ++j) {
+                llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                output.push_back(token_id);
+            }
+            return;
+        }
+
+        resegment(symbols_[p->second.first], output);
+        resegment(symbols_[p->second.second], output);
+    }
+
     void try_add_bigram(int left, int right) {
         if (left == -1 || right == -1) {
             return;
@@ -1893,11 +1943,15 @@ private:
         bigram.score = tok_score.score;
         bigram.size = text.size();
         work_queue_.push(bigram);
+
+        // Do we need to support is_unused?
+        rev_merge[text] = std::make_pair(left, right);
     }
 
     const llama_vocab & vocab_;
     std::vector<llama_sp_symbol> symbols_;
     llama_sp_bigram::queue work_queue_;
+    std::map<std::string, std::pair<int, int> > rev_merge;
 };
 
 static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
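try_add_bigram now records in rev_merge which two symbols each candidate merge was formed from. After the merge loop, resegment checks whether a final symbol's text is itself a vocabulary token; if not, rev_merge lets it recursively split the symbol back into the halves it was merged from, and only text with no recorded merge falls through to byte tokens. The byte fallback maps byte b to id b + 3 because, in the LLaMA SentencePiece vocabulary, ids 0..2 are the special tokens <unk>, <s>, </s> and the byte pieces <0x00>..<0xFF> start at id 3. A minimal sketch of that fallback (bytes_to_ids is a hypothetical helper, not part of the commit):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Mirrors the byte fallback in resegment(): byte b maps to id b + 3,
// skipping the three special tokens at the start of the vocabulary.
static void bytes_to_ids(const std::string & text, std::vector<int32_t> & output) {
    for (unsigned char c : text) {
        output.push_back(static_cast<int32_t>(c) + 3);
    }
}

int main() {
    std::vector<int32_t> ids;
    bytes_to_ids("A", ids);   // 'A' is byte 0x41 (65), so it becomes id 68
    printf("%d\n", ids[0]);
}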

tests/test-tokenizer-0.cpp

@@ -5,6 +5,17 @@
 #include <map>
 #include <vector>
 
+std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
+    std::string result;
+    for (int i = 0; i < count; ++i) {
+        result += llama_token_to_str(ctx, tokens[i]);
+        if (i < count - 1) {
+            result += "_";
+        }
+    }
+    return result;
+}
+
 static const std::map<std::string, std::vector<llama_token>> & k_tests()
 {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
@@ -67,6 +78,8 @@ int main(int argc, char **argv) {
     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res(test_kv.first.size());
        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
+        fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
+            __func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
         res.resize(n);
 
         bool correct = res.size() == test_kv.second.size();