Use sentencepiece tokenization

2023-03-08 16:44:50 -05:00 · 2023-03-08 16:44:50 -05:00 · 67b1c842d9
commit 67b1c842d9
parent 96dc6a0c68
6 changed files with 29 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ models/*

 arm_neon.h
 compile_commands.json
+deps
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ endif
 #

 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  = 
 # OS specific
 # TODO: support Windows
@@ -187,7 +187,7 @@ clean:
 	rm -f *.o main quantize

 main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
 	./main -h

 quantize: quantize.cpp ggml.o utils.o
--- a/README.md
+++ b/README.md
@@ -132,7 +132,7 @@ Here are the step for the LLaMA-7B model:
 # build this repo
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-make
+./build.sh

 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
--- a/build.sh
+++ b/build.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# Fetch and build sentencepiece v0.1.97 into deps/ (once), then build this repo.
+set -e  # abort on the first failing step instead of continuing in the wrong directory
+
+mkdir -p deps
+cd deps
+if [ ! -f v0.1.97.tar.gz ]
+then
+    # -f: treat an HTTP error as a failure instead of saving the error page as the tarball
+    curl -fLO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
+fi
+if [ ! -f libsentencepiece.a ]
+then
+    tar xzvf v0.1.97.tar.gz
+    # Subshell keeps the cwd at deps/; nproc may be missing (e.g. macOS), so fall back to getconf.
+    (cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build && cmake .. &&
+        make sentencepiece-static -j "$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN)")
+    cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
+fi
+cd ..
+make
--- a/build_deps.sh
+++ b/build_deps.sh
@@ -1,12 +0,0 @@
-#https://github.com/google/sentencepiece.git
-#9ffb33a14c97c512103be0ee74740099660b39aa
-
-curl -LO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
-tar xzvf sentencepiece-0.1.97.tar.gz
-cd sentencepiece-0.1.97/src
-mkdir build
-cd build
-cmake ..
-make sentencepiece-static -j $(nproc)
-cd ../..
-
--- a/main.cpp
+++ b/main.cpp
@@ -855,6 +855,8 @@ int main(int argc, char ** argv) {
    printf("\n\n");

    std::vector<gpt_vocab::id> embd;
+    std::vector<gpt_vocab::id> all_tokens;
+    std::string full_text = "";

    // determine the required inference memory per token:
    size_t mem_per_token = 0;
@@ -920,6 +922,7 @@ int main(int argc, char ** argv) {

                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(id);
+                all_tokens.push_back(id);

                t_sample_us += ggml_time_us() - t_start_sample_us;
            }
@@ -938,6 +941,7 @@ int main(int argc, char ** argv) {
                embd.push_back(embd_inp[input_consumed]);
                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(embd_inp[input_consumed]);
+                all_tokens.push_back(embd_inp[input_consumed]);
                ++input_consumed;
                if (embd.size() > params.n_batch) {
                    break;