llama2.c: escape whitespaces w/ U+2581 in vocab converter the llama.cpp way
This commit is contained in:
parent
20c44711bc
commit
0722e58ac2
1 changed files with 11 additions and 0 deletions
|
@ -10,6 +10,7 @@
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
#include <sstream>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
@ -564,6 +565,15 @@ bool is_ggml_file(const char *filename) {
|
||||||
return magic == GGUF_MAGIC;
|
return magic == GGUF_MAGIC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns a copy of `text` in which every ASCII space (' ') is replaced by the
// three-byte UTF-8 sequence E2 96 81 (the encoding of U+2581). All other bytes,
// including tabs and newlines, are copied through unchanged.
static std::string llama_escape_whitespaces(const std::string& text) {
    std::string escaped;
    std::string::size_type start = 0;
    while (true) {
        // Locate the next space at or after `start`.
        const std::string::size_type space = text.find(' ', start);
        if (space == std::string::npos) {
            // No more spaces: copy the remaining tail verbatim and finish.
            escaped.append(text, start, std::string::npos);
            break;
        }
        // Copy the run of non-space bytes, then emit the escape for the space.
        escaped.append(text, start, space - start);
        escaped += "\xe2\x96\x81";
        start = space + 1;
    }
    return escaped;
}
|
||||||
|
|
||||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||||
if (is_ggml_file(filename)) {
|
if (is_ggml_file(filename)) {
|
||||||
struct ggml_context * ctx_data = NULL;
|
struct ggml_context * ctx_data = NULL;
|
||||||
|
@ -639,6 +649,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
||||||
} else {
|
} else {
|
||||||
type = LLAMA_TOKEN_TYPE_NORMAL;
|
type = LLAMA_TOKEN_TYPE_NORMAL;
|
||||||
}
|
}
|
||||||
|
text = llama_escape_whitespaces(text);
|
||||||
|
|
||||||
vocab->id_to_token[id].text = text;
|
vocab->id_to_token[id].text = text;
|
||||||
vocab->id_to_token[id].score = score;
|
vocab->id_to_token[id].score = score;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue