Added whitespace escaping and unescaping
Now we see some resemblance to the Meta-Tokenizer, I think. The only problem: how to integrate this into the `llama.cpp` kernel.
parent 94a0ee1eb8
commit 0e74a7222e

4 changed files with 94 additions and 35 deletions
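For context: SentencePiece marks word boundaries with U+2581 (`▁`, LOWER ONE EIGHTH BLOCK), whose UTF-8 encoding is the byte triple `0xE2 0x96 0x81` that appears throughout this diff. A minimal standalone sketch of the convention (names are mine, not from this commit):

```cpp
#include <cstdio>
#include <string>

// U+2581 ("▁") encoded as UTF-8: the bytes 0xE2 0x96 0x81 used below.
static const std::string sp_space = "\xe2\x96\x81";

int main() {
    // SentencePiece stores "Hello World" as pieces like "▁Hello" and "▁World",
    // so the tests escape whitespace to "▁" before tokenizing and undo the
    // mapping when turning tokens back into text.
    std::string escaped = sp_space + "Hello" + sp_space + "World";
    std::printf("%s\n", escaped.c_str());  // prints: ▁Hello▁World
    return 0;
}
```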
convert.py

```diff
@@ -233,12 +233,7 @@ class SentencePieceVocab:
         for i in range(tokenizer.vocab_size()):
-            # TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
             piece = tokenizer.id_to_piece(i)
-            text: bytes
-            if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
-                text: bytes = piece.encode("utf-8")
-            else:
-                text = piece.replace("\u2581", " ").encode("utf-8")
+            text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
             yield text, score
```
llama.cpp

```diff
@@ -1832,13 +1832,13 @@ struct llama_tokenizer {
     llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars
+        // split string into utf8 chars / token?
         int index = 0;
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            // size_t len = utf8_len(text[offs]);
-            size_t len = llama_trie_find(vocab_.trie, text, offs);
+            size_t len = utf8_len(text[offs]);
+            // size_t len = llama_trie_find(vocab_.trie, text, offs);
             if (len == 0) {
                 len = utf8_len(text[offs]);
             }
```
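For reference, `utf8_len` (the fallback both branches rely on) maps a UTF-8 lead byte to the length of its sequence. A sketch of the helper as it exists in `llama.cpp`, reproduced from memory, so double-check against the tree:

```cpp
#include <cstddef>
#include <cstdint>

// Sequence length of a UTF-8 character from its lead byte:
// 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4
// (continuation bytes 10xxxxxx also map to 1 here).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
```

Note that it never returns 0, so the `if (len == 0)` fallback only ever fires for the (now commented-out) trie path.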
tests/test-tokenizer-0.cpp

```diff
@@ -5,13 +5,44 @@
 #include <map>
 #include <vector>

-std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
+    const char* word = llama_token_to_str(ctx, token);
+    if (strlen(word) >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + (word + 3);
+    }
+    return word;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-        if (i < count - 1) {
-            result += "_";
-        }
+        result += unescape_whitespace(ctx, tokens[i]);
     }
     return result;
 }
```
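One property of this `escape_whitespace` worth noting: it always prepends a single `▁` and collapses each run of whitespace to one `▁`, rather than marking every whitespace character. A standalone check (escape side adapted from the diff above):

```cpp
#include <cstdio>
#include <string>

// Escape side only, adapted from the test above so it runs standalone.
static std::string escape_whitespace(const std::string & text) {
    std::string result = "\xe2\x96\x81";  // leading "▁"
    bool escaping = false;
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
            if (!escaping) {
                result += "\xe2\x96\x81";  // one "▁" per whitespace run
                escaping = true;
            }
        } else {
            escaping = false;
            result += text[offs];
        }
    }
    return result;
}

int main() {
    std::printf("%s\n", escape_whitespace("Hello World").c_str());  // ▁Hello▁World
    std::printf("%s\n", escape_whitespace(" Hello World").c_str()); // ▁▁Hello▁World
    return 0;
}
```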
```diff
@@ -19,12 +50,14 @@ std::string detokenize(llama_context * ctx, const llama_token * tokens, int count
 static const std::map<std::string, std::vector<llama_token>> & k_tests()
 {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "Hello World",        { 1,  10994,   2787, }, },
-        { " Hello World",       { 1,  15043,   2787, }, },
-        { " Hello World!",      { 1,  15043,   2787,  29991, }, },
-        { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
-        { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
-        { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+        { "Hello world",        { 1,  15043,   3186, }, },
+        { " Hello world",       { 1,  29871,  15043,   3186, }, },
+        { "Hello World",        { 1,  15043,   2787, }, },
+        { " Hello World",       { 1,  29871,  15043,   2787, }, },
+        {" Hello World!",       { 1,  29871,  15043,   2787,  29991, }, },
+        {" this is 🦙.cpp",     { 1,  29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
+        {"w048 7tuijk dsdfhu",  { 1,    281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
+        {"нещо на Български",   { 1,   1538,   4851,    665,   1386,  29713,   1305, }, },
     };
     return _k_tests;
 };
```
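The re-baselined expectations follow directly from the escaping. Comparing the old and new tables suggests the relevant pieces: 10994 is apparently `Hello`, 15043 is `▁Hello`, 2787 is `▁World`, 29871 is a bare `▁`, and id 1 is BOS. So, for example:

```cpp
// "Hello World"  -> escaped "▁Hello▁World"  -> { 1, 15043, 2787 }
// " Hello World" -> escaped "▁▁Hello▁World" -> { 1, 29871, 15043, 2787 }
// i.e. a leading space now surfaces as its own "▁" token instead of being
// folded into the following word.
```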
```diff
@@ -77,9 +110,9 @@ int main(int argc, char **argv) {

     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
+        const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
-            __func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
+            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
         res.resize(n);

         bool correct = res.size() == test_kv.second.size();
```
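A small review note on the new call: `escape_whitespace` takes a `const std::string &`, so going through `test_kv.first.c_str()` only forces an extra conversion back to `std::string`. In the context of the loop above, this drop-in replacement would be equivalent and cheaper:

```cpp
// Avoids the redundant char* -> std::string round trip:
const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first).c_str(),
                             res.data(), int(res.size()), true);
```

Also note that `res` is still sized from the unescaped text while the escaped text handed to `llama_tokenize` is longer; if memory serves, `llama_tokenize` signals overflow with a negative return rather than writing past the buffer, but the sizing is worth a second look.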
tests/test-tokenizer-1.cpp

```diff
@@ -8,13 +8,44 @@
 #include <map>
 #include <vector>

-std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
+    const char* word = llama_token_to_str(ctx, token);
+    if (strlen(word) >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + (word + 3);
+    }
+    return word;
+}
+
+static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += llama_token_to_str(ctx, tokens[i]);
-        if (i < count - 1) {
-            result += "_";
-        }
+        result += unescape_whitespace(ctx, tokens[i]);
     }
     return result;
 }
```
```diff
@@ -66,22 +97,22 @@ int main(int argc, char **argv) {
     }

     for (int i = 0; i < n_vocab; ++i) {
-        const char * forward = llama_token_to_str(ctx, i);
-        std::vector<llama_token> tokens(strlen(forward));
-        auto n = llama_tokenize(ctx, forward, tokens.data(), strlen(forward), false);
+        std::string forward = llama_token_to_str(ctx, i);
+        std::vector<llama_token> tokens(forward.length());
+        int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
         if (n == 1) {
             if (i != tokens[0]) {
-                const char* backward = llama_token_to_str(ctx, tokens[0]);
+                std::string backward = unescape_whitespace(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
-                    __func__, i, forward, tokens[0], backward);
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
             }
         } else {
             if (i <= 258) {
                 fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
-                    __func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             } else {
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
-                    __func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
+                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             }
         }
     }
```
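On the `i <= 258` cutoff: to the best of my knowledge (the commit does not spell this out), ids 0 to 2 in the LLaMA SentencePiece vocabulary are `<unk>`, `<s>`, `</s>`, and ids 3 to 258 are the 256 byte-fallback pieces `<0x00>` through `<0xFF>`. Their rendered strings are not expected to round-trip through `tokenize()` as a single id, so mismatches below the cutoff are logged as info rather than error:

```cpp
// Hypothetical bookkeeping; the commit itself just hard-codes 258.
constexpr int n_control_tokens = 3;    // <unk>, <s>, </s> (ids 0..2)
constexpr int n_byte_tokens    = 256;  // <0x00> .. <0xFF>  (ids 3..258)
static_assert(n_control_tokens + n_byte_tokens - 1 == 258,
              "last byte-fallback id matches the cutoff used in the test");
```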
```diff
@@ -91,7 +122,7 @@ int main(int argc, char **argv) {
         std::wstring wstr(1, ch);
         std::string str = converter.to_bytes(wstr);
         std::vector<llama_token> tokens(strlen(str.c_str()));
-        auto n = llama_tokenize(ctx, str.c_str(), tokens.data(), str.length(), false);
+        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
         if (n == 1) {
             fprintf(stderr, "%s : info: %s tokenized to %d \n",
                 __func__, str.c_str(), tokens[0]);
```