Remove Unprintable
Fixes #11

This fixes a Japanese prompt I was attempting to run, e.g.:

`./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 -n 512 -p $'人生の意味は'`

Output before change:

`人生の意���、フロントカードに���いてる。 2019年3月 © All Rights Reserved. [end of text]`

So it is outputting some characters, but some come out as `�`.

Output after change:

`人生の意は、一人が一人ということであります。は安部が立していたので、去からは一人の人にれるのはにとどまったのですが、そう`
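For context on the garbled glyphs: `�` is the Unicode replacement character U+FFFD, which encodes to the bytes `EF BF BD` in UTF-8, and the model's vocab apparently contains entries made up of exactly those bytes. Below is a minimal, self-contained sketch of that byte pattern with a hypothetical `is_replacement_only` check, for illustration only; the commit itself uses the fixed `std::unordered_set` lookup shown in the diff below.

```cpp
#include <cstdio>
#include <string>

// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
static const std::string kReplacement = "\xEF\xBF\xBD";

// Hypothetical helper: true if a vocab entry is empty or consists only of
// replacement-character bytes ("", "\xEF\xBF\xBD", "\xEF\xBF\xBD\xEF\xBF\xBD", ...).
static bool is_replacement_only(const std::string & word) {
    if (word.empty()) {
        return true;  // the commit also skips the empty token
    }
    if (word.size() % kReplacement.size() != 0) {
        return false;
    }
    for (size_t i = 0; i < word.size(); i += kReplacement.size()) {
        if (word.compare(i, kReplacement.size(), kReplacement) != 0) {
            return false;
        }
    }
    return true;
}

int main() {
    printf("%d\n", is_replacement_only("\xEF\xBF\xBD"));              // 1
    printf("%d\n", is_replacement_only("\xEF\xBF\xBD\xEF\xBF\xBD"));  // 1
    printf("%d\n", is_replacement_only("人生"));                       // 0
}
```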
parent 4235e3d5b3
commit 4726e671e6

1 changed file with 8 additions and 0 deletions: main.cpp (+8, -0)
main.cpp

@@ -10,6 +10,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <unordered_set>

 // determine number of model parts based on the dimension
 static const std::map<int, int> LLAMA_N_PARTS = {

@@ -123,6 +124,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     }

     // load vocab
+
+    std::unordered_set<std::string> unprintable_characters = {"", "<EFBFBD>", "<EFBFBD><EFBFBD>"};
+
     {
         const int32_t n_vocab = model.hparams.n_vocab;

@@ -140,6 +144,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);

+            if(unprintable_characters.find(word) != unprintable_characters.end()) {
+                continue;
+            }
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
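A note on the design: because the load loop `continue`s when a word matches, the filtered vocab ids are simply never inserted into `vocab.token_to_id` / `vocab.id_to_token`; they are dropped rather than remapped. Also, `<EFBFBD>` in the diff viewer is its rendering of the raw bytes `EF BF BD` (UTF-8 for U+FFFD); assuming the committed file stores those bytes literally, the added set is equivalent to this escaped form:

```cpp
#include <string>
#include <unordered_set>

// Same set as in the diff, with the raw bytes written as escape sequences.
static const std::unordered_set<std::string> unprintable_characters = {
    "",                          // empty token
    "\xEF\xBF\xBD",              // one replacement character
    "\xEF\xBF\xBD\xEF\xBF\xBD",  // two replacement characters
};
```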