Added whitespace escaping and unescaping
Now we see some resemblance to the Meta-Tokenizer, I think. The only problem left: how to integrate this into the `llama.cpp` kernel.
This commit is contained in:
parent 94a0ee1eb8
commit 0e74a7222e

4 changed files with 94 additions and 35 deletions
@@ -233,12 +233,7 @@ class SentencePieceVocab:
         for i in range(tokenizer.vocab_size()):
             # TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
             piece = tokenizer.id_to_piece(i)
-            text: bytes
-            if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
-                text: bytes = piece.encode("utf-8")
-            else:
-                text = piece.replace("\u2581", " ").encode("utf-8")
-
+            text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
             yield text, score
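For context: SentencePiece vocabularies mark word boundaries with U+2581 (LOWER ONE EIGHTH BLOCK, "▁") instead of plain spaces. With this change the converter exports pieces verbatim, and the marker handling moves to the C++ side below. A minimal sketch (not part of the commit) of the byte-level convention:

#include <cstdio>
#include <string>

int main() {
    // U+2581 is the three-byte UTF-8 sequence 0xE2 0x96 0x81 -- the same
    // bytes escape_whitespace() emits further down in this commit.
    const std::string sep = "\xe2\x96\x81";      // "▁"
    const std::string piece = sep + "Hello";     // how the vocab stores " Hello"
    // The removed Python branch rewrote the marker back to a space:
    std::string text = piece;
    if (text.compare(0, sep.size(), sep) == 0) {
        text = " " + text.substr(sep.size());
    }
    printf("'%s' -> '%s'\n", piece.c_str(), text.c_str());  // '▁Hello' -> ' Hello'
    return 0;
}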
@@ -1832,13 +1832,13 @@ struct llama_tokenizer {
     llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars
+        // split string into utf8 chars / token?
         int index = 0;
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            // size_t len = utf8_len(text[offs]);
-            size_t len = llama_trie_find(vocab_.trie, text, offs);
+            size_t len = utf8_len(text[offs]);
+            // size_t len = llama_trie_find(vocab_.trie, text, offs);
             if (len == 0) {
                 len = utf8_len(text[offs]);
             }
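This reverts the symbol splitting from the trie lookup back to plain UTF-8 splitting; note that with the trie commented out, the `len == 0` fallback can no longer trigger, since `utf8_len` always returns at least 1. For reference, `utf8_len` reads the sequence length off the lead byte's high nibble; a standalone sketch essentially matching the llama.cpp helper:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Byte length of a UTF-8 sequence from the lead byte's high nibble
// (0xxx -> 1, 110x -> 2, 1110 -> 3, 11110 -> 4). Continuation bytes
// (10xx) also map to 1 here.
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    printf("%zu %zu\n", utf8_len('a'), utf8_len('\xe2'));  // prints: 1 3
    return 0;
}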
@@ -5,13 +5,44 @@
 #include <map>
 #include <vector>

-std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
+    const char* word = llama_token_to_str(ctx, token);
+    if (strlen(word) >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + (word + 3);
+    }
+    return word;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-        if (i < count - 1) {
-            result += "_";
-        }
+        result += unescape_whitespace(ctx, tokens[i]);
    }
     return result;
 }
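A quick way to see what the helper pair does: escaping emits the three-byte U+2581 marker once at the start and once per run of whitespace (the `escaping` flag collapses consecutive whitespace characters into a single marker), and unescaping maps a token string beginning with that marker back to a space plus the remainder. A minimal standalone sketch of the escape side, mirroring the function above:

#include <cstdio>
#include <string>

// Standalone mirror of escape_whitespace() above, for illustration only.
static std::string escape_whitespace(const std::string & text) {
    std::string result = "\xe2\x96\x81";   // unconditional leading marker
    bool escaping = false;
    for (char c : text) {
        if (c == ' ' || c == '\t' || c == '\n') {
            if (!escaping) {               // a whitespace run becomes one marker
                result += "\xe2\x96\x81";
                escaping = true;
            }
        } else {
            escaping = false;
            result += c;
        }
    }
    return result;
}

int main() {
    // "Hello  world" -> "▁Hello▁world": the two-space run collapses to one marker.
    printf("%s\n", escape_whitespace("Hello  world").c_str());
    return 0;
}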
@@ -19,12 +50,14 @@ std::string detokenize(llama_context * ctx, const llama_token * tokens, int coun
 static const std::map<std::string, std::vector<llama_token>> & k_tests()
 {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "Hello World", { 1, 10994, 2787, }, },
-        { " Hello World", { 1, 15043, 2787, }, },
-        { " Hello World!", { 1, 15043, 2787, 29991, }, },
-        { " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
-        { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
-        { "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
+        { "Hello world", { 1, 15043, 3186, }, },
+        { " Hello world", { 1, 29871, 15043, 3186, }, },
+        { "Hello World", { 1, 15043, 2787, }, },
+        { " Hello World", { 1, 29871, 15043, 2787, }, },
+        {" Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
+        {" this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
+        {"w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
+        {"нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
     };
     return _k_tests;
};
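Note how the expected token IDs change with the new escaping: inputs with a leading space now begin with 29871, consistent with a bare `▁` piece, and several first tokens shift (e.g. 29893 → 281, 821 → 1538), presumably to their `▁`-prefixed counterparts in the vocab.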
@@ -77,9 +110,9 @@ int main(int argc, char **argv) {

     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
+        const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
-            __func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
+            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
         res.resize(n);

         bool correct = res.size() == test_kv.second.size();
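The test driver now mirrors SentencePiece's own pipeline: the raw test string is escaped before being handed to `llama_tokenize`, and the resulting tokens are unescaped only for display, so the expected-ID comparison happens entirely in escaped space.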
@@ -8,13 +8,44 @@
 #include <map>
 #include <vector>

-std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
+    const char* word = llama_token_to_str(ctx, token);
+    if (strlen(word) >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + (word + 3);
+    }
+    return word;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-        if (i < count - 1) {
-            result += "_";
-        }
+        result += unescape_whitespace(ctx, tokens[i]);
     }
     return result;
 }
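This second test program gets a verbatim copy of the same three helpers. If the escaping convention sticks, they look like natural candidates for a shared header (or for the library itself) rather than per-test duplication.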
@@ -66,22 +97,22 @@ int main(int argc, char **argv) {
     }

     for (int i = 0; i < n_vocab; ++i) {
-        const char * forward = llama_token_to_str(ctx, i);
-        std::vector<llama_token> tokens(strlen(forward));
-        auto n = llama_tokenize(ctx, forward, tokens.data(), strlen(forward), false);
+        std::string forward = llama_token_to_str(ctx, i);
+        std::vector<llama_token> tokens(forward.length());
+        int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
         if (n == 1) {
             if (i != tokens[0]) {
-                const char* backward = llama_token_to_str(ctx, tokens[0]);
+                std::string backward = unescape_whitespace(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
-                    __func__, i, forward, tokens[0], backward);
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
             }
         } else {
             if (i <= 258) {
                 fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
-                    __func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             } else {
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
-                    __func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             }
         }
     }
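With whitespace kept escaped end to end, the per-token round trip `llama_token_to_str` → `llama_tokenize` can be checked directly on the raw piece string; `unescape_whitespace` is used purely for the human-readable log output.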
@@ -91,7 +122,7 @@ int main(int argc, char **argv) {
         std::wstring wstr(1, ch);
         std::string str = converter.to_bytes(wstr);
         std::vector<llama_token> tokens(strlen(str.c_str()));
-        auto n = llama_tokenize(ctx, str.c_str(), tokens.data(), str.length(), false);
+        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
         if (n == 1) {
             fprintf(stderr, "%s : info: %s tokenized to %d \n",
                 __func__, str.c_str(), tokens[0]);
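One loose end worth flagging: `tokens` is sized and the capacity argument passed from the unescaped `str`, while the string actually handed to `llama_tokenize` is longer (the leading `▁` alone adds three bytes). For a single-character input the prefix can push the token count past that capacity, which would surface here as an `n != 1` result rather than an explicit error.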