From 6b9e424671e81c45394dcaca0bc04ee74d15aa82 Mon Sep 17 00:00:00 2001
From: wizard
Date: Mon, 13 Mar 2023 18:01:05 +0800
Subject: [PATCH] add support to load tokenizer.model from command line argument

---
 main.cpp  | 13 +++++++------
 utils.cpp | 34 +++++-----------------------------
 utils.h   |  3 ++-
 3 files changed, 14 insertions(+), 36 deletions(-)

diff --git a/main.cpp b/main.cpp
index 34d625891..dd1c9c8d2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -776,14 +776,15 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
-
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
+    params.tokenizer = "models/tokenizer.model";
 
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
 
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load(params.tokenizer);
+
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
@@ -823,12 +824,12 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;
 
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(sp, vocab, params.prompt, true);
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(sp, vocab, params.antiprompt, false);
 
     printf("\n");
     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -999,7 +1000,7 @@ int main(int argc, char ** argv) {
                     buf[n_read+1] = 0;
                 }
 
-                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(sp, vocab, buf, false);
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
                 remaining_tokens -= line_inp.size();
diff --git a/utils.cpp b/utils.cpp
index 36cb95e98..bd449bdce 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -51,6 +51,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_batch = std::stoi(argv[++i]);
         } else if (arg == "-m" || arg == "--model") {
            params.model = argv[++i];
+        } else if (arg == "--tokenizer") {
+            params.tokenizer = argv[++i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-start") {
@@ -98,6 +100,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  --tokenizer FNAME\n");
+    fprintf(stderr, "                        tokenizer path (default: %s)\n", params.tokenizer.c_str());
     fprintf(stderr, "\n");
 }
 
@@ -274,39 +278,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos) {
     std::vector<gpt_vocab::id> res;
 
-    // if (bos) {
-    //     res.push_back(1); // TODO: replace with vocab.bos
-    // }
-
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
-
     std::vector<std::string> pieces;
     return sp.EncodeAsIds(text);
-/*
-    for (const auto & piece : pieces) {
-        printf("piece: %s\n", piece.c_str());
-        if (vocab.token_to_id.count(piece) > 0) {
-            res.push_back(vocab.token_to_id.at(piece));
-        } else {
-            // handle unknown token
-        }
-    }
-
-    for (const auto& id : res) {
-        printf("%d\n", id);
-    }
-
-    return res;*/
 }
 
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
diff --git a/utils.h b/utils.h
index 07f1c8faf..0c8ba8c21 100644
--- a/utils.h
+++ b/utils.h
@@ -29,6 +29,7 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
 
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+    std::string tokenizer = "models/tokenizer.model"; // tokenizer path
     std::string prompt;
 
     bool use_color = false; // use color to distinguish generations and inputs
@@ -75,7 +76,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
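
Not part of the diff above: a minimal standalone sketch of how a tokenizer path
taken from the command line is loaded and used with the SentencePiece C++ API,
mirroring what main.cpp does after this change. The file name, prompt text, and
paths below are illustrative assumptions, not code from this repository.

    // tokenizer_sketch.cpp (hypothetical file name, illustrative only)
    #include <cstdio>
    #include <string>
    #include <vector>

    #include <sentencepiece_processor.h>

    int main(int argc, char ** argv) {
        // default mirrors the patch; override via the first argument,
        // e.g. ./tokenizer_sketch models/tokenizer.model
        std::string tokenizer_path = "models/tokenizer.model";
        if (argc > 1) {
            tokenizer_path = argv[1];
        }

        sentencepiece::SentencePieceProcessor sp;
        const auto status = sp.Load(tokenizer_path);
        if (!status.ok()) {
            fprintf(stderr, "failed to load tokenizer '%s': %s\n",
                    tokenizer_path.c_str(), status.ToString().c_str());
            return 1;
        }

        // EncodeAsIds returns the SentencePiece token ids for the text,
        // which is what the patched llama_tokenize() hands back to main.cpp
        const std::vector<int> ids = sp.EncodeAsIds("Hello world");
        for (const int id : ids) {
            printf("%d ", id);
        }
        printf("\n");
        return 0;
    }

With the patch applied, the analogous invocation of this repository's binary
would be along the lines of
./main -m models/llama-7B/ggml-model.bin --tokenizer models/tokenizer.model -p "Hello world"
(paths illustrative; when --tokenizer is omitted, the default
models/tokenizer.model is used).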