Remove Unprintable
Fixes #11. This fixes a Japanese prompt I was attempting to run, e.g.: `./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128 -n 512 -p $'人生の意味は'` Output before the change: `人生の意���、フロントカードに���いてる。 2019年3月 © All Rights Reserved. [end of text]` So it was outputting some characters correctly, but others came out as the replacement character �. Output after the change: `人生の意は、一人が一人ということであります。は安部が立していたので、去からは一人の人にれるのはにとどまったのですが、そう`
This commit is contained in:
parent
4235e3d5b3
commit
4726e671e6
1 changed file with 8 additions and 0 deletions
8
main.cpp
8
main.cpp
|
@ -10,6 +10,7 @@
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
// determine number of model parts based on the dimension
|
// determine number of model parts based on the dimension
|
||||||
static const std::map<int, int> LLAMA_N_PARTS = {
|
static const std::map<int, int> LLAMA_N_PARTS = {
|
||||||
|
@ -123,6 +124,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
}
|
}
|
||||||
|
|
||||||
// load vocab
|
// load vocab
|
||||||
|
|
||||||
|
std::unordered_set<std::string> unprintable_characters = {"", "<EFBFBD>", "<EFBFBD><EFBFBD>"};
|
||||||
|
|
||||||
{
|
{
|
||||||
const int32_t n_vocab = model.hparams.n_vocab;
|
const int32_t n_vocab = model.hparams.n_vocab;
|
||||||
|
|
||||||
|
@ -140,6 +144,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
word.resize(len);
|
word.resize(len);
|
||||||
fin.read((char *) word.data(), len);
|
fin.read((char *) word.data(), len);
|
||||||
|
|
||||||
|
if(unprintable_characters.find(word) != unprintable_characters.end()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
vocab.token_to_id[word] = i;
|
vocab.token_to_id[word] = i;
|
||||||
vocab.id_to_token[i] = word;
|
vocab.id_to_token[i] = word;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue