add support to load tokenizer.model from a command-line argument

wizard 2023-03-13 18:01:05 +08:00
parent 7438b83939
commit 6b9e424671
3 changed files with 14 additions and 36 deletions

main.cpp

@@ -776,14 +776,15 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
+    params.tokenizer = "models/tokenizer.model";
 
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
-
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
 
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load(params.tokenizer);
+
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
@@ -823,12 +824,12 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;
 
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(sp, vocab, params.prompt, true);
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(sp, vocab, params.antiprompt, false);
 
     printf("\n");
     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -999,7 +1000,7 @@ int main(int argc, char ** argv) {
                 buf[n_read+1] = 0;
             }
 
-            std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+            std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(sp, vocab, buf, false);
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
             remaining_tokens -= line_inp.size();
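
Note: with this change the SentencePiece processor is loaded only after argument parsing, so the hardcoded "./models/tokenizer.model" path gives way to params.tokenizer. The return status of sp.Load() is still discarded, so a bad --tokenizer path would only surface later, at encode time. A minimal checked-load sketch (not part of the commit; the helper name is hypothetical):

    // Hypothetical helper: load a SentencePiece model and report failures.
    // sp.Load() returns a sentencepiece::util::Status rather than throwing.
    #include <cstdio>
    #include <string>
    #include <sentencepiece_processor.h>

    static bool load_tokenizer(const std::string & path,
                               sentencepiece::SentencePieceProcessor & sp) {
        const auto status = sp.Load(path);
        if (!status.ok()) {
            fprintf(stderr, "%s: failed to load tokenizer '%s': %s\n",
                    __func__, path.c_str(), status.ToString().c_str());
            return false;
        }
        return true;
    }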

utils.cpp

@@ -51,6 +51,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_batch = std::stoi(argv[++i]);
         } else if (arg == "-m" || arg == "--model") {
             params.model = argv[++i];
+        } else if (arg == "--tokenizer") {
+            params.tokenizer = argv[++i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-start") {
@@ -98,6 +100,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  --tokenizer FNAME\n");
+    fprintf(stderr, "                        tokenizer path (default: %s)\n", params.tokenizer.c_str());
     fprintf(stderr, "\n");
 }
@@ -274,39 +278,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos) {
     std::vector<gpt_vocab::id> res;
 
-    // if (bos) {
-    //     res.push_back(1); // TODO: replace with vocab.bos
-    // }
-
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
     std::vector<std::string> pieces;
     return sp.EncodeAsIds(text);
-    /*
-    for (const auto & piece : pieces) {
-        printf("piece: %s\n", piece.c_str());
-        if (vocab.token_to_id.count(piece) > 0) {
-            res.push_back(vocab.token_to_id.at(piece));
-        } else {
-            // handle unknown token
-        }
-    }
-
-    for (const auto& id : res) {
-        printf("%d\n", id);
-    }
-    return res;*/
 }
 
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
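
Note: after this hunk, llama_tokenize() simply returns sp.EncodeAsIds(text); the vocab and bos parameters, along with the local res and pieces vectors, are now unused. If BOS insertion is still wanted, a sketch in the spirit of the removed commented-out code (which assumed LLaMA's BOS id of 1) might look like:

    // Sketch only, not part of the commit: delegate tokenization to
    // SentencePiece but still honor the bos flag.
    std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp,
                                              const gpt_vocab & vocab, const std::string & text, bool bos) {
        const std::vector<int> ids = sp.EncodeAsIds(text);
        std::vector<gpt_vocab::id> res(ids.begin(), ids.end());
        if (bos) {
            res.insert(res.begin(), 1); // TODO: replace with vocab.bos
        }
        return res;
    }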

utils.h

@@ -29,6 +29,7 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
 
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+    std::string tokenizer = "models/tokenizer.model"; // tokenizer path
     std::string prompt;
 
     bool use_color = false; // use color to distinguish generations and inputs
@@ -75,7 +76,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
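
Note: utils.h now names sentencepiece::SentencePieceProcessor in a declaration, so <sentencepiece_processor.h> must be in scope there; this diff does not show that include being added. End to end, the new flag threads from gpt_params through main(), e.g. ./main -m models/llama-7B/ggml-model.bin --tokenizer models/tokenizer.model -p "Hello" (binary name as built at the time). A standalone sketch of the same plumbing, handy for sanity-checking a tokenizer.model on its own (paths and program shape are illustrative):

    // Standalone sketch (not part of the commit): take the tokenizer path
    // from argv, mirroring the new --tokenizer flag, and print token ids.
    #include <cstdio>
    #include <string>
    #include <sentencepiece_processor.h>

    int main(int argc, char ** argv) {
        std::string tokenizer = "models/tokenizer.model"; // same default as gpt_params
        if (argc > 1) {
            tokenizer = argv[1];
        }
        sentencepiece::SentencePieceProcessor sp;
        if (!sp.Load(tokenizer).ok()) {
            fprintf(stderr, "could not load tokenizer from '%s'\n", tokenizer.c_str());
            return 1;
        }
        for (const int id : sp.EncodeAsIds("Hello world")) {
            printf("%d ", id);
        }
        printf("\n");
        return 0;
    }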