Remove Unprintable
Fixes #11

This fixes a Japanese prompt I was attempting to run, e.g.:

`./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 -n 512 -p $'人生の意味は'`

Output before change:

`人生の意���、フロントカードに���いてる。 2019年3月 © All Rights Reserved. [end of text]`

So it is outputting some characters, but some come out as `�`.

Output after change:

`人生の意は、一人が一人ということであります。は安部が立していたので、去からは一人の人にれるのはにとどまったのですが、そう`
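For context on the garbled glyphs: `�` is the Unicode replacement character U+FFFD, which encodes to the bytes `EF BF BD` in UTF-8, and the model's vocab apparently contains entries made up of exactly those bytes. Below is a minimal, self-contained sketch of that byte pattern with a hypothetical `is_replacement_only` check, for illustration only; the commit itself uses the fixed `std::unordered_set` lookup shown in the diff below.

```cpp
#include <cstdio>
#include <string>

// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
static const std::string kReplacement = "\xEF\xBF\xBD";

// Hypothetical helper: true if a vocab entry is empty or consists only of
// replacement-character bytes ("", "\xEF\xBF\xBD", "\xEF\xBF\xBD\xEF\xBF\xBD", ...).
static bool is_replacement_only(const std::string & word) {
    if (word.empty()) {
        return true;  // the commit also skips the empty token
    }
    if (word.size() % kReplacement.size() != 0) {
        return false;
    }
    for (size_t i = 0; i < word.size(); i += kReplacement.size()) {
        if (word.compare(i, kReplacement.size(), kReplacement) != 0) {
            return false;
        }
    }
    return true;
}

int main() {
    printf("%d\n", is_replacement_only("\xEF\xBF\xBD"));              // 1
    printf("%d\n", is_replacement_only("\xEF\xBF\xBD\xEF\xBF\xBD"));  // 1
    printf("%d\n", is_replacement_only("人生"));                       // 0
}
```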
parent 4235e3d5b3
commit 4726e671e6

1 changed file with 8 additions and 0 deletions: main.cpp (+8, -0)
main.cpp

@@ -10,6 +10,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <unordered_set>

 // determine number of model parts based on the dimension
 static const std::map<int, int> LLAMA_N_PARTS = {

@@ -123,6 +124,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     }

     // load vocab
+
+    std::unordered_set<std::string> unprintable_characters = {"", "<EFBFBD>", "<EFBFBD><EFBFBD>"};
+
     {
         const int32_t n_vocab = model.hparams.n_vocab;

@@ -140,6 +144,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);

+            if(unprintable_characters.find(word) != unprintable_characters.end()) {
+                continue;
+            }
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
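A note on the design: because the load loop `continue`s when a word matches, the filtered vocab ids are simply never inserted into `vocab.token_to_id` / `vocab.id_to_token`; they are dropped rather than remapped. Also, `<EFBFBD>` in the diff viewer is its rendering of the raw bytes `EF BF BD` (UTF-8 for U+FFFD); assuming the committed file stores those bytes literally, the added set is equivalent to this escaped form:

```cpp
#include <string>
#include <unordered_set>

// Same set as in the diff, with the raw bytes written as escape sequences.
static const std::unordered_set<std::string> unprintable_characters = {
    "",                          // empty token
    "\xEF\xBF\xBD",              // one replacement character
    "\xEF\xBF\xBD\xEF\xBF\xBD",  // two replacement characters
};
```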