Revert back to C++11, avoid std::string_view in the tokenizer

2023-03-21 11:34:58 +02:00 · 2023-03-21 11:34:58 +02:00 · a0d00bd809
commit a0d00bd809
parent ecd982ddc0
4 changed files with 29 additions and 25 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -221,14 +221,14 @@ add_library(utils OBJECT
            utils.h)

 target_include_directories(utils PUBLIC .)
-target_compile_features(utils PUBLIC cxx_std_11)
+target_compile_features(utils PUBLIC cxx_std_11) # don't bump

 add_library(ggml OBJECT
            ggml.c
            ggml.h)

 target_include_directories(ggml PUBLIC .)
-target_compile_features(ggml PUBLIC c_std_17)
+target_compile_features(ggml PUBLIC c_std_11) # don't bump

 #
 # Linking
--- a/3
+++ b/3
@ -30,8 +30,9 @@ endif
 # Compile flags
 #

+# keep standard at C11 and C++11
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

 # OS specific
--- a/utils.cpp
+++ b/utils.cpp
@ -250,7 +250,8 @@ struct llama_sp_symbol {
    using index = int;
    index prev;
    index next;
-    std::string_view text;
+    const char * text;
+    size_t n;
 };

 struct llama_sp_bigram {
@ -267,19 +268,23 @@ struct llama_sp_bigram {
    size_t size;
 };

+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
 struct llama_tokenizer {
    llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

-    void tokenize(std::string_view text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // split string into utf8 chars
        int index = 0;
-        while (!text.empty()) {
+        size_t offs = 0;
+        while (offs < text.size()) {
            llama_sp_symbol sym;
-            size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
-            sym.text = std::string_view(text.data(), char_len);
+            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            sym.text = text.c_str() + offs;
+            sym.n = char_len;
+            offs += char_len;
            sym.prev = index - 1;
-            text.remove_prefix(char_len);
-            sym.next = text.empty() ? -1 : index + 1;
+            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols_.emplace_back(std::move(sym));
        }
@ -298,14 +303,16 @@ struct llama_tokenizer {
            auto & right_sym = symbols_[bigram.right];

            // if one of the symbols already got merged, skip it.
-            if (left_sym.text.empty() || right_sym.text.empty() ||
-                left_sym.text.size() + right_sym.text.size() != bigram.size) {
+            if (left_sym.n == 0 || right_sym.n == 0 ||
+                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
-            left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
-            right_sym.text = std::string_view("");
+            left_sym.n += right_sym.n;
+            right_sym.n = 0;
+
+            //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
@ -320,11 +327,11 @@ struct llama_tokenizer {

        for (int i = 0; i != -1; i = symbols_[i].next) {
            auto & symbol = symbols_[i];
-            auto token = vocab_.token_to_id.find(std::string(symbol.text));
+            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));

            if (token == vocab_.token_to_id.end()) {
                // output any symbols that did not form tokens as bytes.
-                for (int j = 0; j < symbol.text.size(); ++j) {
+                for (int j = 0; j < (int) symbol.n; ++j) {
                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                    output.push_back(token_id);
                }
@ -340,8 +347,8 @@ private:
            return;
        }

-        std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
-        auto token = vocab_.token_to_id.find(std::string(text));
+        const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
+        auto token = vocab_.token_to_id.find(text);

        if (token == vocab_.token_to_id.end()) {
            return;
@ -399,16 +406,12 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
        vocab.token_to_id[word] = i;
        vocab.id_to_token[i] = word;
        vocab.score[i] = score;
-
-        //if (i < 30000) {
-        //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
-        //}
    }

    return true;
 }

-std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos) {
+std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
    llama_tokenizer tokenizer(vocab);
    std::vector<llama_vocab::id> output;

--- a/utils.h
+++ b/utils.h
@ -79,7 +79,7 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);

 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos);
+std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);

 // sample next token given probabilities for each embedding
 //