Use sentencepiece tokenization
This commit is contained in:
parent
96dc6a0c68
commit
67b1c842d9
6 changed files with 29 additions and 15 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -21,3 +21,4 @@ models/*
|
|||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
deps
|
||||
|
|
4
Makefile
4
Makefile
|
@ -31,7 +31,7 @@ endif
|
|||
#
|
||||
|
||||
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
|
||||
CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
# OS specific
|
||||
# TODO: support Windows
|
||||
|
@ -187,7 +187,7 @@ clean:
|
|||
rm -f *.o main quantize
|
||||
|
||||
main: main.cpp ggml.o utils.o
|
||||
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
|
||||
./main -h
|
||||
|
||||
quantize: quantize.cpp ggml.o utils.o
|
||||
|
|
|
@ -132,7 +132,7 @@ Here are the step for the LLaMA-7B model:
|
|||
# build this repo
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
make
|
||||
./build.sh
|
||||
|
||||
# obtain the original LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
|
|
21
build.sh
Executable file
21
build.sh
Executable file
|
@ -0,0 +1,21 @@
|
|||
#!/bin/sh
|
||||
|
||||
if [ ! -d deps ]
|
||||
then
|
||||
mkdir deps
|
||||
fi
|
||||
cd deps
|
||||
if [ ! -f v0.1.97.tar.gz ]
|
||||
then
|
||||
curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
|
||||
fi
|
||||
if [ ! -f libsentencepiece.a ]
|
||||
then
|
||||
tar xzvf v0.1.97.tar.gz
|
||||
cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build && cmake ..
|
||||
make sentencepiece-static -j $(nproc)
|
||||
cd ../..
|
||||
cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
|
||||
fi
|
||||
cd ..
|
||||
make
|
|
@ -1,12 +0,0 @@
|
|||
#https://github.com/google/sentencepiece.git
|
||||
#9ffb33a14c97c512103be0ee74740099660b39aa
|
||||
|
||||
curl -LO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
|
||||
tar xzvf sentencepiece-0.1.97.tar.gz
|
||||
cd sentencepiece-0.1.97/src
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make sentencepiece-static -j $(nproc)
|
||||
cd ../..
|
||||
|
4
main.cpp
4
main.cpp
|
@ -855,6 +855,8 @@ int main(int argc, char ** argv) {
|
|||
printf("\n\n");
|
||||
|
||||
std::vector<gpt_vocab::id> embd;
|
||||
std::vector<gpt_vocab::id> all_tokens;
|
||||
std::string full_text = "";
|
||||
|
||||
// determine the required inference memory per token:
|
||||
size_t mem_per_token = 0;
|
||||
|
@ -920,6 +922,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(id);
|
||||
all_tokens.push_back(id);
|
||||
|
||||
t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
|
@ -938,6 +941,7 @@ int main(int argc, char ** argv) {
|
|||
embd.push_back(embd_inp[input_consumed]);
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(embd_inp[input_consumed]);
|
||||
all_tokens.push_back(embd_inp[input_consumed]);
|
||||
++input_consumed;
|
||||
if (embd.size() > params.n_batch) {
|
||||
break;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue