work towards tokenizer integration
This commit is contained in:
parent
c80e2a8f2a
commit
96dc6a0c68
3 changed files with 41 additions and 12 deletions
7
Makefile
7
Makefile
|
@ -31,9 +31,8 @@ endif
|
|||
#
|
||||
|
||||
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
|
||||
CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
# OS specific
|
||||
# TODO: support Windows
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
|
@ -188,7 +187,7 @@ clean:
|
|||
rm -f *.o main quantize
|
||||
|
||||
main: main.cpp ggml.o utils.o
|
||||
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
|
||||
./main -h
|
||||
|
||||
quantize: quantize.cpp ggml.o utils.o
|
||||
|
|
12
build_deps.sh
Normal file
12
build_deps.sh
Normal file
|
@ -0,0 +1,12 @@
|
|||
#https://github.com/google/sentencepiece.git
|
||||
#9ffb33a14c97c512103be0ee74740099660b39aa
|
||||
|
||||
curl -LO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
|
||||
tar xzvf sentencepiece-0.1.97.tar.gz
|
||||
cd sentencepiece-0.1.97/src
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make sentencepiece-static -j $(nproc)
|
||||
cd ../..
|
||||
|
34
main.cpp
34
main.cpp
|
@ -14,6 +14,12 @@
|
|||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <sentencepiece_processor.h>
|
||||
|
||||
|
||||
//Tokenizer object
|
||||
sentencepiece::SentencePieceProcessor processor;
|
||||
|
||||
#define ANSI_COLOR_RED "\x1b[31m"
|
||||
#define ANSI_COLOR_GREEN "\x1b[32m"
|
||||
#define ANSI_COLOR_YELLOW "\x1b[33m"
|
||||
|
@ -758,6 +764,11 @@ void sigint_handler(int signo) {
|
|||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
const auto status = processor.Load("models/tokenizer.model");
|
||||
if (!status.ok()) {
|
||||
printf("%s", status.ToString().c_str());
|
||||
// error
|
||||
}
|
||||
ggml_time_init();
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
|
||||
|
@ -807,7 +818,8 @@ int main(int argc, char ** argv) {
|
|||
std::vector<float> logits;
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
|
||||
std::vector<gpt_vocab::id> embd_inp;
|
||||
processor.Encode(params.prompt, &embd_inp);
|
||||
|
||||
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
|
||||
|
||||
|
@ -935,14 +947,20 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// display text
|
||||
if (!input_noecho) {
|
||||
for (auto id : embd) {
|
||||
printf("%s", vocab.id_to_token[id].c_str());
|
||||
std::string check = processor.IdToPiece(all_tokens.at(all_tokens.size()-1));
|
||||
if(check != "<EFBFBD>") { // ensure a multi-byte token is finished generating before outputting the text
|
||||
std::string text;
|
||||
processor.Decode(all_tokens, &text);
|
||||
std::string chunk = text.substr(full_text.length());
|
||||
printf("%s", chunk.c_str());
|
||||
full_text += chunk;
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (params.use_color && embd_inp.size() <= input_consumed) {
|
||||
printf(ANSI_COLOR_RESET);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if there is no pending user input
|
||||
if (params.use_color && embd_inp.size() <= input_consumed) {
|
||||
printf(ANSI_COLOR_RESET);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// in interactive mode, and not currently processing queued inputs;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue