add support to load tokenizer.model from command line argument

wizard 2023-03-13 18:01:05 +08:00
parent 7438b83939
commit 6b9e424671
3 changed files with 14 additions and 36 deletions
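
The tokenizer path was previously hard-coded as "./models/tokenizer.model"; with this commit it becomes a --tokenizer option (default "models/tokenizer.model"). As a usage sketch, assuming the built binary is named ./main (the binary name is not part of this diff):

./main -m models/llama-7B/ggml-model.bin --tokenizer models/tokenizer.model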


@@ -776,14 +776,15 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
+    params.tokenizer = "models/tokenizer.model";
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load(params.tokenizer);
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
@@ -823,12 +824,12 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(sp, vocab, params.prompt, true);
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(sp, vocab, params.antiprompt, false);
     printf("\n");
     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -999,7 +1000,7 @@ int main(int argc, char ** argv) {
                     buf[n_read+1] = 0;
                 }
-                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(sp, vocab, buf, false);
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                 remaining_tokens -= line_inp.size();


@@ -51,6 +51,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_batch = std::stoi(argv[++i]);
         } else if (arg == "-m" || arg == "--model") {
             params.model = argv[++i];
+        } else if (arg == "--tokenizer") {
+            params.tokenizer = argv[++i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-start") {
@@ -98,6 +100,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  --tokenizer FNAME\n");
+    fprintf(stderr, "                        tokenizer path (default: %s)\n", params.tokenizer.c_str());
     fprintf(stderr, "\n");
 }
@@ -274,39 +278,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos) {
     std::vector<gpt_vocab::id> res;
-    // if (bos) {
-    //     res.push_back(1); // TODO: replace with vocab.bos
-    // }
-    sentencepiece::SentencePieceProcessor sp;
-    sp.Load("./models/tokenizer.model");
-    std::vector<std::string> pieces;
     return sp.EncodeAsIds(text);
-    /*
-    for (const auto & piece : pieces) {
-        printf("piece: %s\n", piece.c_str());
-        if (vocab.token_to_id.count(piece) > 0) {
-            res.push_back(vocab.token_to_id.at(piece));
-        } else {
-            // handle unknown token
-        }
-    }
-    for (const auto& id : res) {
-        printf("%d\n", id);
-    }
-    return res;*/
 }
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {


@@ -29,6 +29,7 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+    std::string tokenizer = "models/tokenizer.model"; // tokenizer path
     std::string prompt;
     bool use_color = false; // use color to distinguish generations and inputs
@@ -75,7 +76,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos);
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
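
As a minimal sketch of the new calling convention, the snippet below mirrors the main() hunk above: the SentencePieceProcessor is loaded from params.tokenizer and passed into llama_tokenize instead of being created inside it. The header name "utils.h", the helper name tokenize_prompt, and the status check on Load() are assumptions for illustration only and are not part of this commit.

#include <cstdio>
#include <string>
#include <vector>
#include <sentencepiece_processor.h>
#include "utils.h"   // assumed header declaring gpt_params, gpt_vocab and llama_tokenize

// Hypothetical helper: load the tokenizer from the --tokenizer path and tokenize a prompt.
static std::vector<gpt_vocab::id> tokenize_prompt(const gpt_params & params,
                                                  const gpt_vocab & vocab,
                                                  const std::string & prompt) {
    sentencepiece::SentencePieceProcessor sp;
    const auto status = sp.Load(params.tokenizer);   // path now comes from the command line
    if (!status.ok()) {
        fprintf(stderr, "failed to load tokenizer from %s\n", params.tokenizer.c_str());
        return {};
    }
    // New signature: the already-loaded processor is passed in as the first argument.
    return llama_tokenize(sp, vocab, prompt, true);
}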