work towards tokenizer integration

2023-03-08 16:44:50 -05:00 · 2023-03-08 16:44:50 -05:00 · 96dc6a0c68
commit 96dc6a0c68
parent c80e2a8f2a
3 changed files with 41 additions and 12 deletions
--- a/7
+++ b/7
@ -31,9 +31,8 @@ endif
 #
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS  =
+LDFLAGS  = 
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -188,7 +187,7 @@ clean:
 	rm -f *.o main quantize
 main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
 	./main -h
 quantize: quantize.cpp ggml.o utils.o
--- a/build_deps.sh
+++ b/build_deps.sh
@ -0,0 +1,12 @@
 # Download the SentencePiece v0.1.97 release tarball and build its static library.
 # Upstream repository: https://github.com/google/sentencepiece.git
 # 9ffb33a14c97c512103be0ee74740099660b39aa  (hash recorded by the original author; presumably the upstream revision -- confirm)
 #
 # Stop at the first failing command so that a failed download, extraction or
 # `cd` never lets the later steps run in the wrong directory.
 set -e
 # -f makes curl fail on an HTTP error instead of saving the error page as the tarball.
 curl -fLO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
 tar xzvf sentencepiece-0.1.97.tar.gz
 cd sentencepiece-0.1.97/src
 # -p keeps a re-run from failing when the build directory already exists.
 mkdir -p build
 cd build
 # NOTE(review): this configures from src/ rather than the tarball's top-level directory -- confirm this is intended.
 cmake ..
 # `nproc` is not available on every platform (e.g. stock macOS); without it the
 # original `-j $(nproc)` collapsed to a bare `-j`, i.e. unlimited parallel jobs.
 # Fall back to getconf, then to a single job.
 njobs=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
 make sentencepiece-static -j "$njobs"
 # Leaves the working directory at sentencepiece-0.1.97/ (same as the original script).
 cd ../..
--- a/main.cpp
+++ b/main.cpp
@ -14,6 +14,12 @@
 #include <signal.h>
 #include <unistd.h>
 #include <sentencepiece_processor.h>
 //Tokenizer object
 sentencepiece::SentencePieceProcessor processor;
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
 #define ANSI_COLOR_YELLOW  "\x1b[33m"
@ -758,6 +764,11 @@ void sigint_handler(int signo) {
 }
 int main(int argc, char ** argv) {
    const auto status = processor.Load("models/tokenizer.model");
    if (!status.ok()) {
       printf("%s", status.ToString().c_str());
       // error
    }
    ggml_time_init();
    const int64_t t_main_start_us = ggml_time_us();
@ -807,7 +818,8 @@ int main(int argc, char ** argv) {
    std::vector<float> logits;
    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp;
    processor.Encode(params.prompt, &embd_inp);
    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
@ -935,14 +947,20 @@ int main(int argc, char ** argv) {
        // display text
        if (!input_noecho) {
-            for (auto id : embd) {
+            std::string check = processor.IdToPiece(all_tokens.at(all_tokens.size()-1));
-                printf("%s", vocab.id_to_token[id].c_str());
+            if(check != "<EFBFBD>") {  // ensure a multi-byte token is finished generating before outputting the text
                std::string text;
                processor.Decode(all_tokens, &text);
                std::string chunk = text.substr(full_text.length());
                printf("%s", chunk.c_str());
                full_text += chunk;
                // reset color to default if there is no pending user input
                if (params.use_color && embd_inp.size() <= input_consumed) {
                    printf(ANSI_COLOR_RESET);
                }
                fflush(stdout);
            }
            // reset color to default if there is no pending user input
            if (params.use_color && embd_inp.size() <= input_consumed) {
                printf(ANSI_COLOR_RESET);
            }
            fflush(stdout);
        }
        // in interactive mode, and not currently processing queued inputs;