From 0722e58ac26e07065458bbdcf8e2f0ee8b8a113a Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sat, 26 Aug 2023 22:43:00 +0100
Subject: [PATCH] llama2.c: escape whitespaces w/ U+2581 in vocab converter the llama.cpp way

---
 .../convert-llama2c-to-ggml.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 859728627..701823136 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <sstream>
 #include
 #include
@@ -564,6 +565,15 @@ bool is_ggml_file(const char *filename) {
     return magic == GGUF_MAGIC;
 }
 
+static std::string llama_escape_whitespaces(const std::string& text) {
+    std::ostringstream out;
+    for (char c : text) {
+        if (c == ' ') out << "\xe2\x96\x81";
+        else out << c;
+    }
+    return out.str();
+}
+
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
     if (is_ggml_file(filename)) {
         struct ggml_context * ctx_data = NULL;
@@ -639,6 +649,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
             } else {
                 type = LLAMA_TOKEN_TYPE_NORMAL;
             }
+            text = llama_escape_whitespaces(text);
             vocab->id_to_token[id].text = text;
             vocab->id_to_token[id].score = score;
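
For reference, a minimal standalone sketch of the escaping behaviour this patch introduces; the program below is illustrative only and not part of the patch, and the escape_whitespaces name and main driver are assumptions made for the example:

#include <cstdio>
#include <sstream>
#include <string>

// Same replacement as llama_escape_whitespaces in the patch: every ASCII space
// becomes U+2581 (LOWER ONE EIGHTH BLOCK), encoded in UTF-8 as "\xe2\x96\x81".
static std::string escape_whitespaces(const std::string & text) {
    std::ostringstream out;
    for (char c : text) {
        if (c == ' ') out << "\xe2\x96\x81";
        else          out << c;
    }
    return out.str();
}

int main() {
    // Prints "▁hello▁world" for the input " hello world".
    printf("%s\n", escape_whitespaces(" hello world").c_str());
    return 0;
}

U+2581 is the marker SentencePiece uses to represent spaces inside token text, so storing the converted llama2.c vocabulary in this escaped form should match what llama.cpp expects when it later turns the marker back into a space.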