add support to load tokenizer.model from command line argument

wizard 2023-03-13 18:01:05 +08:00
parent 7438b83939
commit 6b9e424671
3 changed files with 14 additions and 36 deletions
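
The tokenizer path was previously hard-coded as "./models/tokenizer.model"; with this commit it becomes a --tokenizer option (default "models/tokenizer.model"). As a usage sketch, assuming the built binary is named ./main (the binary name is not part of this diff):

./main -m models/llama-7B/ggml-model.bin --tokenizer models/tokenizer.model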


@@ -776,14 +776,15 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
+    params.tokenizer = "models/tokenizer.model";
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load(params.tokenizer);
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
@@ -823,12 +824,12 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(sp, vocab, params.prompt, true);
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(sp, vocab, params.antiprompt, false);
     printf("\n");
     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -999,7 +1000,7 @@ int main(int argc, char ** argv) {
                     buf[n_read+1] = 0;
                 }
-                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(sp, vocab, buf, false);
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                 remaining_tokens -= line_inp.size();


@@ -51,6 +51,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_batch = std::stoi(argv[++i]);
         } else if (arg == "-m" || arg == "--model") {
             params.model = argv[++i];
+        } else if (arg == "--tokenizer") {
+            params.tokenizer = argv[++i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-start") {
@@ -98,6 +100,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  --tokenizer FNAME\n");
+    fprintf(stderr, "                        tokenizer path (default: %s)\n", params.tokenizer.c_str());
     fprintf(stderr, "\n");
 }
@@ -274,39 +278,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos) {
     std::vector<gpt_vocab::id> res;
-    // if (bos) {
-    //     res.push_back(1); // TODO: replace with vocab.bos
-    // }
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
-    std::vector<std::string> pieces;
     return sp.EncodeAsIds(text);
-    /*
-    for (const auto & piece : pieces) {
-        printf("piece: %s\n", piece.c_str());
-        if (vocab.token_to_id.count(piece) > 0) {
-            res.push_back(vocab.token_to_id.at(piece));
-        } else {
-            // handle unknown token
-        }
-    }
-    for (const auto& id : res) {
-        printf("%d\n", id);
-    }
-    return res;*/
 }
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {


@@ -29,6 +29,7 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+    std::string tokenizer = "models/tokenizer.model"; // tokenizer path
     std::string prompt;
     bool use_color = false; // use color to distinguish generations and inputs
@@ -75,7 +76,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos);
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
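
As a minimal sketch of the new calling convention, the snippet below mirrors the main() hunk above: the SentencePieceProcessor is loaded from params.tokenizer and passed into llama_tokenize instead of being created inside it. The header name "utils.h", the helper name tokenize_prompt, and the status check on Load() are assumptions for illustration only and are not part of this commit.

#include <cstdio>
#include <string>
#include <vector>
#include <sentencepiece_processor.h>
#include "utils.h"   // assumed header declaring gpt_params, gpt_vocab and llama_tokenize

// Hypothetical helper: load the tokenizer from the --tokenizer path and tokenize a prompt.
static std::vector<gpt_vocab::id> tokenize_prompt(const gpt_params & params,
                                                  const gpt_vocab & vocab,
                                                  const std::string & prompt) {
    sentencepiece::SentencePieceProcessor sp;
    const auto status = sp.Load(params.tokenizer);   // path now comes from the command line
    if (!status.ok()) {
        fprintf(stderr, "failed to load tokenizer from %s\n", params.tokenizer.c_str());
        return {};
    }
    // New signature: the already-loaded processor is passed in as the first argument.
    return llama_tokenize(sp, vocab, prompt, true);
}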