work towards tokenizer integration

2023-03-08 16:44:50 -05:00 · 2023-03-08 16:44:50 -05:00 · 96dc6a0c68
commit 96dc6a0c68
parent c80e2a8f2a
3 changed files with 41 additions and 12 deletions
--- a/7
+++ b/7
@ -31,9 +31,8 @@ endif
 #

 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS  =
-
+CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
+LDFLAGS  = 
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -188,7 +187,7 @@ clean:
 	rm -f *.o main quantize

 main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o ../../sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
 	./main -h

 quantize: quantize.cpp ggml.o utils.o
--- a/build_deps.sh
+++ b/build_deps.sh
@ -0,0 +1,12 @@
+#!/bin/sh
+# Download and build the static SentencePiece library, release 0.1.97.
+# Upstream: https://github.com/google/sentencepiece.git -- commit recorded by the author: 9ffb33a14c97c512103be0ee74740099660b39aa
+set -e  # stop at the first failed step so later steps never run in the wrong directory
+curl -fLO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
+tar xzvf sentencepiece-0.1.97.tar.gz
+# Configure from the top-level CMakeLists.txt rather than src/, and let re-runs reuse build/.
+mkdir -p sentencepiece-0.1.97/build
+cd sentencepiece-0.1.97/build
+cmake ..
+make sentencepiece-static -j "$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)"
+cd ../..  # build/ is two levels deep, so this returns to the starting directory
--- a/main.cpp
+++ b/main.cpp
@ -14,6 +14,12 @@
 #include <signal.h>
 #include <unistd.h>

+#include <sentencepiece_processor.h>
+
+
+// Global SentencePiece tokenizer; main() loads its model before the prompt is encoded.
+sentencepiece::SentencePieceProcessor processor;
+
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
 #define ANSI_COLOR_YELLOW  "\x1b[33m"
@ -758,6 +764,11 @@ void sigint_handler(int signo) {
 }

 int main(int argc, char ** argv) {
+    const auto status = processor.Load("models/tokenizer.model");
+    if (!status.ok()) {
+       printf("%s", status.ToString().c_str());
+       // FIXME: the load failure is only printed; execution continues without a loaded tokenizer model
+    }
    ggml_time_init();
    const int64_t t_main_start_us = ggml_time_us();

@ -807,7 +818,8 @@ int main(int argc, char ** argv) {
    std::vector<float> logits;

    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp;
+    processor.Encode(params.prompt, &embd_inp);

    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

@ -935,14 +947,20 @@ int main(int argc, char ** argv) {

        // display text
        if (!input_noecho) {
-            for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+            std::string check = processor.IdToPiece(all_tokens.at(all_tokens.size()-1));
+            if(check != "<EFBFBD>") {  // ensure a multi-byte token is finished generating before outputting the text
+                std::string text;
+                processor.Decode(all_tokens, &text);
+                std::string chunk = text.substr(full_text.length());
+                printf("%s", chunk.c_str());
+                full_text += chunk;
+
+                // reset color to default if there is no pending user input
+                if (params.use_color && embd_inp.size() <= input_consumed) {
+                    printf(ANSI_COLOR_RESET);
+                }
+                fflush(stdout);
            }
-            // reset color to default if we there is no pending user input
-            if (params.use_color && embd_inp.size() <= input_consumed) {
-                printf(ANSI_COLOR_RESET);
-            }
-            fflush(stdout);
        }

        // in interactive mode, and not currently processing queued inputs;