Added whitespace escaping and unescaping

Now we see some resemblance to the Meta-Tokenizer, I think. The only remaining problem: how to integrate this into the `llama.cpp` kernel.
This commit is contained in:
goerch 2023-07-22 22:24:21 +02:00
parent 94a0ee1eb8
commit 0e74a7222e
4 changed files with 94 additions and 35 deletions

View file

@ -233,12 +233,7 @@ class SentencePieceVocab:
for i in range(tokenizer.vocab_size()):
# TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
piece = tokenizer.id_to_piece(i)
text: bytes
if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
text: bytes = piece.encode("utf-8")
else:
text = piece.replace("\u2581", " ").encode("utf-8")
text: bytes = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
yield text, score

View file

@ -1832,13 +1832,13 @@ struct llama_tokenizer {
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars
// split string into utf8 chars / token?
int index = 0;
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
// size_t len = utf8_len(text[offs]);
size_t len = llama_trie_find(vocab_.trie, text, offs);
size_t len = utf8_len(text[offs]);
// size_t len = llama_trie_find(vocab_.trie, text, offs);
if (len == 0) {
len = utf8_len(text[offs]);
}

View file

@ -5,13 +5,44 @@
#include <map>
#include <vector>
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
// Escape whitespace the way SentencePiece expects: prepend one U+2581 (▁)
// marker and replace each run of whitespace (' ', '\t', '\n') with a single
// marker. Note: consecutive whitespace collapses to one marker, so the
// escaping is lossy for runs.
static std::string escape_whitespace(const std::string& text) {
    const std::string space_mark = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    std::string out = space_mark;                  // leading marker, as SentencePiece does
    bool prev_was_ws = false;
    for (char c : text) {
        const bool is_ws = (c == ' ' || c == '\t' || c == '\n');
        if (is_ws) {
            if (!prev_was_ws) {
                out += space_mark; // emit one marker per whitespace run
            }
        } else {
            out += c;
        }
        prev_was_ws = is_ws;
    }
    return out;
}
// Convert one token id back to text, mapping a leading U+2581 (▁) marker
// back to a plain space. Only a single leading marker is handled; markers
// elsewhere in the token text pass through unchanged.
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
    const std::string piece = llama_token_to_str(ctx, token);
    const std::string marker = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    if (piece.rfind(marker, 0) == 0) {         // prefix check
        return " " + piece.substr(marker.size());
    }
    return piece;
}
// Concatenate the unescaped text of `count` tokens into a single string.
// Each token is run through the single-token overload, which maps a leading
// U+2581 (▁) marker back to a space.
// NOTE(review): the previous revision also appended the raw token text and a
// '_' separator; those leftover lines made every token appear twice in the
// output — they are removed here.
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
    std::string result;
    for (int i = 0; i < count; ++i) {
        result += unescape_whitespace(ctx, tokens[i]);
    }
    return result;
}
@ -19,12 +50,14 @@ std::string detokenize(llama_context * ctx, const llama_token * tokens, int coun
// Expected tokenizations for the whitespace-escaping tokenizer. Each entry
// maps an input string to the expected token ids (token 1 is BOS).
// NOTE(review): the previous revision's entries were still present above the
// updated ones; with duplicate keys in a std::map initializer the FIRST
// occurrence wins, so the stale expectations silently shadowed the updated
// ones — the stale entries are removed here.
static const std::map<std::string, std::vector<llama_token>> & k_tests()
{
    static std::map<std::string, std::vector<llama_token>> _k_tests = {
        { "Hello world",        { 1, 15043, 3186, }, },
        { " Hello world",       { 1, 29871, 15043, 3186, }, },
        { "Hello World",        { 1, 15043, 2787, }, },
        { " Hello World",       { 1, 29871, 15043, 2787, }, },
        { " Hello World!",      { 1, 29871, 15043, 2787, 29991, }, },
        { " this is 🦙.cpp",    { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
        { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
        { "нещо на Български",  { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
    };
    return _k_tests;
};
@ -77,9 +110,9 @@ int main(int argc, char **argv) {
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res(test_kv.first.size());
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
__func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
res.resize(n);
bool correct = res.size() == test_kv.second.size();

View file

@ -8,13 +8,44 @@
#include <map>
#include <vector>
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
// SentencePiece-style whitespace escaping: a U+2581 (▁) marker is prepended,
// and every run of ' ', '\t' or '\n' becomes a single marker. Runs collapse,
// so the transformation is not reversible for consecutive whitespace.
static std::string escape_whitespace(const std::string& text) {
    const std::string space_mark = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    std::string escaped = space_mark;              // leading marker first
    bool in_run = false;
    for (char c : text) {
        const bool is_ws = (c == ' ' || c == '\t' || c == '\n');
        if (!is_ws) {
            escaped += c;
        } else if (!in_run) {
            escaped += space_mark; // one marker for the whole run
        }
        in_run = is_ws;
    }
    return escaped;
}
// Map one token id back to its text form, turning a leading U+2581 (▁)
// marker into a plain space. Markers that are not at the start of the
// token text are left as-is.
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
    const std::string word = llama_token_to_str(ctx, token);
    const std::string marker = "\xe2\x96\x81"; // UTF-8 encoding of U+2581
    if (word.compare(0, marker.size(), marker) == 0 && word.size() >= marker.size()) {
        return " " + word.substr(marker.size());
    }
    return word;
}
// Build the detokenized string for `count` tokens by unescaping each token
// individually (the single-token overload turns a leading U+2581 marker into
// a space) and concatenating the results.
// NOTE(review): leftover lines from the replaced detokenize() body also
// appended the raw token text plus a '_' separator, duplicating every token
// in the output — removed here.
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
    std::string result;
    for (int i = 0; i < count; ++i) {
        result += unescape_whitespace(ctx, tokens[i]);
    }
    return result;
}
@ -66,22 +97,22 @@ int main(int argc, char **argv) {
}
for (int i = 0; i < n_vocab; ++i) {
const char * forward = llama_token_to_str(ctx, i);
std::vector<llama_token> tokens(strlen(forward));
auto n = llama_tokenize(ctx, forward, tokens.data(), strlen(forward), false);
std::string forward = llama_token_to_str(ctx, i);
std::vector<llama_token> tokens(forward.length());
int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
if (n == 1) {
if (i != tokens[0]) {
const char* backward = llama_token_to_str(ctx, tokens[0]);
std::string backward = unescape_whitespace(ctx, tokens[0]);
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
__func__, i, forward, tokens[0], backward);
__func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
}
} else {
if (i <= 258) {
fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
} else {
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
}
}
}
@ -91,7 +122,7 @@ int main(int argc, char **argv) {
std::wstring wstr(1, ch);
std::string str = converter.to_bytes(wstr);
std::vector<llama_token> tokens(strlen(str.c_str()));
auto n = llama_tokenize(ctx, str.c_str(), tokens.data(), str.length(), false);
auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
if (n == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n",
__func__, str.c_str(), tokens[0]);