Add tokenizer test + revert to C++11 (#355)

* Add test-tokenizer-0 to do a few tokenizations - feel free to expand
* Added option to convert-pth-to-ggml.py script to dump just the vocabulary
* Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests)
* Added utility to load vocabulary file from previous point (temporary implementation)
* Avoid using std::string_view and drop back to C++11 (hope I didn't break something)
* Rename gpt_vocab -> llama_vocab
* All CMake binaries go into ./bin/ now
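A minimal sketch of how the new vocabulary loader and tokenizer fit together, in the spirit of test-tokenizer-0 (the test file itself is not part of the utils.cpp diff below; the sketch assumes the llama_vocab_load/llama_tokenize declarations added here live in utils.h, and the input string is arbitrary):

#include <cstdio>
#include <string>
#include <vector>

#include "utils.h"

int main() {
    llama_vocab vocab;
    if (!llama_vocab_load("models/ggml-vocab.bin", vocab)) {
        fprintf(stderr, "failed to load vocab\n");
        return 1;
    }

    // tokenize with a leading BOS token and print id -> token pairs
    const std::vector<llama_vocab::id> res = llama_tokenize(vocab, "Hello world", true);
    for (const auto id : res) {
        printf("%d : '%s'\n", id, vocab.id_to_token.at(id).c_str());
    }

    return 0;
}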
parent 2e664f1ff4
commit eb34620aec
11 changed files with 249 additions and 148 deletions
utils.cpp (174 changed lines)
@@ -240,61 +240,6 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.size() == 0) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
 static size_t utf8_len(char src) {
     const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
     uint8_t highbits = static_cast<uint8_t>(src) >> 4;
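The trailing context above is the retained utf8_len helper: the top four bits of a UTF-8 lead byte encode the length of its byte sequence. A self-contained check of the lookup table (the function is reproduced from the hunk; the asserted bytes are illustrative):

#include <cassert>
#include <cstddef>
#include <cstdint>

static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    assert(utf8_len('a')    == 1); // 0x61 -> highbits 0x6, plain ASCII
    assert(utf8_len('\xC3') == 2); // lead byte of 'é' (0xC3 0xA9)
    assert(utf8_len('\xE2') == 3); // lead byte of '€' (0xE2 0x82 0xAC)
    assert(utf8_len('\xF0') == 4); // lead byte of a 4-byte sequence
    assert(utf8_len('\xA9') == 1); // continuation byte: still advances by 1
    return 0;
}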
@@ -305,7 +250,8 @@ struct llama_sp_symbol {
     using index = int;
     index prev;
     index next;
-    std::string_view text;
+    const char * text;
+    size_t n;
 };
 
 struct llama_sp_bigram {
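The (text, n) pair above is the C++11 stand-in for the removed std::string_view: a non-owning pointer plus length into the input string, with n == 0 later doubling as the "merged away" marker. An owning string is materialized only where one is needed; a sketch using the struct as defined in this hunk (symbol_to_string is a hypothetical helper for illustration):

#include <cstddef>
#include <string>

struct llama_sp_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

// copy the viewed bytes only on demand, e.g. for map lookups
inline std::string symbol_to_string(const llama_sp_symbol & sym) {
    return std::string(sym.text, sym.n);
}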
@@ -322,19 +268,23 @@ struct llama_sp_bigram {
     size_t size;
 };
 
+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
 struct llama_tokenizer {
-    llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {}
+    llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
 
-    void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         // split string into utf8 chars
         int index = 0;
-        while (!text.empty()) {
+        size_t offs = 0;
+        while (offs < text.size()) {
             llama_sp_symbol sym;
-            size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
-            sym.text = std::string_view(text.data(), char_len);
+            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            sym.text = text.c_str() + offs;
+            sym.n = char_len;
+            offs += char_len;
             sym.prev = index - 1;
-            text.remove_prefix(char_len);
-            sym.next = text.empty() ? -1 : index + 1;
+            sym.next = offs == text.size() ? -1 : index + 1;
             index++;
             symbols_.emplace_back(std::move(sym));
         }
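Illustration of what the rewritten splitting loop builds: for an input such as "héllo" the symbols form a doubly linked list over UTF-8 characters, so later bigram merges are O(1) relinks rather than string edits.

index:  0     1     2     3     4
text:  "h"   "é"   "l"   "l"   "o"   (pointers into the input string)
n:      1     2     1     1     1    (bytes per UTF-8 char)
prev:  -1     0     1     2     3
next:   1     2     3     4    -1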
@@ -353,14 +303,16 @@ struct llama_tokenizer {
            auto & right_sym = symbols_[bigram.right];
 
            // if one of the symbols already got merged, skip it.
-            if (left_sym.text.empty() || right_sym.text.empty() ||
-                left_sym.text.size() + right_sym.text.size() != bigram.size) {
+            if (left_sym.n == 0 || right_sym.n == 0 ||
+                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }
 
            // merge the right sym into the left one
-            left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
-            right_sym.text = std::string_view("");
+            left_sym.n += right_sym.n;
+            right_sym.n = 0;
+
+            //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
 
            // remove the right sym from the chain
            left_sym.next = right_sym.next;
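Merge trace for illustration: collapsing the bigram ("h", "é") from the earlier example grows the left symbol into a 3-byte view, tombstones the right one with n = 0, and relinks the chain; stale queue entries that still reference either symbol are rejected by the n == 0 / size-mismatch check above.

before: [h|n=1] <-> [é|n=2] <-> [l] <-> [l] <-> [o]
after:  [hé|n=3] <-----------> [l] <-> [l] <-> [o]   (the "é" node keeps n = 0)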
@@ -374,13 +326,13 @@ struct llama_tokenizer {
        }
 
        for (int i = 0; i != -1; i = symbols_[i].next) {
-            auto& symbol = symbols_[i];
-            auto token = vocab_.token_to_id.find(std::string(symbol.text));
+            auto & symbol = symbols_[i];
+            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
 
            if (token == vocab_.token_to_id.end()) {
                // output any symbols that did not form tokens as bytes.
-                for (int j = 0; j < symbol.text.size(); ++j) {
-                    gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                for (int j = 0; j < (int) symbol.n; ++j) {
+                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                    output.push_back(token_id);
                }
            } else {
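A note on the + 3 offset in the byte fallback: in the LLaMA sentencepiece vocabulary the first three ids are the special tokens (<unk> = 0, <s> = 1, </s> = 2) and the 256 raw-byte tokens follow, so a byte b that did not form part of any known token is emitted as id b + 3. For example, if '€' does not match a token, its UTF-8 bytes 0xE2 0x82 0xAC become ids 229, 133, and 175.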
@@ -395,8 +347,8 @@ private:
            return;
        }
 
-        std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
-        auto token = vocab_.token_to_id.find(std::string(text));
+        const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
+        auto token = vocab_.token_to_id.find(text);
 
        if (token == vocab_.token_to_id.end()) {
            return;
@@ -416,14 +368,52 @@ private:
        work_queue_.push(bigram);
    }
 
-    const gpt_vocab & vocab_;
+    const llama_vocab & vocab_;
    std::vector<llama_sp_symbol> symbols_;
    llama_sp_bigram::queue work_queue_;
 };
 
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) {
+// TODO: temporary code duplication with llama.cpp
+// will resolve after #77 is merged
+bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
+    std::ifstream fin(fname, std::ios::binary);
+    if (!fin.is_open()) {
+        return false;
+    }
+
+    int n_vocab = 0;
+    fin.read((char *) &n_vocab, sizeof(n_vocab));
+
+    std::string word;
+    std::vector<char> tmp(64);
+
+    for (int i = 0; i < n_vocab; i++) {
+        uint32_t len;
+        fin.read((char *) &len, sizeof(len));
+
+        word.resize(len);
+        if (len > 0) {
+            tmp.resize(len);
+            fin.read(tmp.data(), len);
+            word.assign(tmp.data(), len);
+        } else {
+            word.clear();
+        }
+
+        float score;
+        fin.read((char *) &score, sizeof(score));
+
+        vocab.token_to_id[word] = i;
+        vocab.id_to_token[i] = word;
+        vocab.score[i] = score;
+    }
+
+    return true;
+}
+
+std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
    llama_tokenizer tokenizer(vocab);
-    std::vector<gpt_vocab::id> output;
+    std::vector<llama_vocab::id> output;
 
    if (text.size() == 0) {
        return output;
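The on-disk layout implied by llama_vocab_load above (derived from its reads alone, not from a separate spec): an int32 token count, then per token a uint32 byte length, the raw UTF-8 bytes, and a float score. A hypothetical writer that mirrors those reads, e.g. for producing a file like ./models/ggml-vocab.bin (llama_vocab_save is not part of this commit):

#include <cstdint>
#include <fstream>
#include <string>
#include <utility>
#include <vector>

bool llama_vocab_save(const std::string & fname,
                      const std::vector<std::pair<std::string, float>> & tokens) {
    std::ofstream fout(fname, std::ios::binary);
    if (!fout.is_open()) {
        return false;
    }

    // int32 token count
    const int32_t n_vocab = (int32_t) tokens.size();
    fout.write((const char *) &n_vocab, sizeof(n_vocab));

    // per token: uint32 length, raw UTF-8 bytes, float score
    for (const auto & t : tokens) {
        const uint32_t len = (uint32_t) t.first.size();
        fout.write((const char *) &len, sizeof(len));
        fout.write(t.first.data(), len);
        fout.write((const char *) &t.second, sizeof(t.second));
    }

    return true;
}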
@@ -437,42 +427,22 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_v
    return output;
 }
 
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
-    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
-
-    vocab.token_to_id = ::json_parse(fname);
-
-    for (const auto & kv : vocab.token_to_id) {
-        vocab.id_to_token[kv.second] = kv.first;
-    }
-
-    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
-
-    // print the vocabulary
-    //for (auto kv : vocab.token_to_id) {
-    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
-    //}
-
-    return true;
-}
-
-
-void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k) {
+void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+            [](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
        return a.first > b.first;
    });
 
    logits_id.resize(top_k);
 }
 
-gpt_vocab::id llama_sample_top_p_top_k(
-        const gpt_vocab & vocab,
+llama_vocab::id llama_sample_top_p_top_k(
+        const llama_vocab & vocab,
        const float * logits,
-        std::vector<gpt_vocab::id> & last_n_tokens,
+        std::vector<llama_vocab::id> & last_n_tokens,
        double repeat_penalty,
        int top_k,
        double top_p,
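The std::partial_sort call in sample_top_k orders only the first top_k positions, costing roughly O(n log k) comparisons instead of a full O(n log n) sort. A standalone illustration of the pattern (the logit values are made up):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<double, int>> logits_id = {
        {0.1, 10}, {2.5, 11}, {0.7, 12}, {1.9, 13}, {0.2, 14},
    };
    const int top_k = 2;

    // after this call only the first top_k elements are sorted descending
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, int> & a, const std::pair<double, int> & b) {
        return a.first > b.first;
    });
    logits_id.resize(top_k);

    for (const auto & p : logits_id) {
        printf("%.1f -> token %d\n", p.first, p.second); // 2.5 -> 11, then 1.9 -> 13
    }
    return 0;
}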
@@ -480,7 +450,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
        std::mt19937 & rng) {
    int n_logits = vocab.id_to_token.size();
 
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    std::vector<std::pair<double, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);
 
    {