Revert back to C++11, avoid std::string_view in the tokenizer
This commit is contained in:
parent
ecd982ddc0
commit
a0d00bd809
4 changed files with 29 additions and 25 deletions
|
@ -221,14 +221,14 @@ add_library(utils OBJECT
|
||||||
utils.h)
|
utils.h)
|
||||||
|
|
||||||
target_include_directories(utils PUBLIC .)
|
target_include_directories(utils PUBLIC .)
|
||||||
target_compile_features(utils PUBLIC cxx_std_11)
|
target_compile_features(utils PUBLIC cxx_std_11) # don't bump
|
||||||
|
|
||||||
add_library(ggml OBJECT
|
add_library(ggml OBJECT
|
||||||
ggml.c
|
ggml.c
|
||||||
ggml.h)
|
ggml.h)
|
||||||
|
|
||||||
target_include_directories(ggml PUBLIC .)
|
target_include_directories(ggml PUBLIC .)
|
||||||
target_compile_features(ggml PUBLIC c_std_17)
|
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||||
|
|
||||||
#
|
#
|
||||||
# Linking
|
# Linking
|
||||||
|
|
3
Makefile
3
Makefile
|
@ -30,8 +30,9 @@ endif
|
||||||
# Compile flags
|
# Compile flags
|
||||||
#
|
#
|
||||||
|
|
||||||
|
# keep standard at C11 and C++11
|
||||||
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
||||||
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
|
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
|
|
||||||
# OS specific
|
# OS specific
|
||||||
|
|
45
utils.cpp
45
utils.cpp
|
@ -250,7 +250,8 @@ struct llama_sp_symbol {
|
||||||
using index = int;
|
using index = int;
|
||||||
index prev;
|
index prev;
|
||||||
index next;
|
index next;
|
||||||
std::string_view text;
|
const char * text;
|
||||||
|
size_t n;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sp_bigram {
|
struct llama_sp_bigram {
|
||||||
|
@ -267,19 +268,23 @@ struct llama_sp_bigram {
|
||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// original implementation:
|
||||||
|
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
|
||||||
struct llama_tokenizer {
|
struct llama_tokenizer {
|
||||||
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
|
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
|
||||||
|
|
||||||
void tokenize(std::string_view text, std::vector<llama_vocab::id> & output) {
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||||
// split string into utf8 chars
|
// split string into utf8 chars
|
||||||
int index = 0;
|
int index = 0;
|
||||||
while (!text.empty()) {
|
size_t offs = 0;
|
||||||
|
while (offs < text.size()) {
|
||||||
llama_sp_symbol sym;
|
llama_sp_symbol sym;
|
||||||
size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
|
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
|
||||||
sym.text = std::string_view(text.data(), char_len);
|
sym.text = text.c_str() + offs;
|
||||||
|
sym.n = char_len;
|
||||||
|
offs += char_len;
|
||||||
sym.prev = index - 1;
|
sym.prev = index - 1;
|
||||||
text.remove_prefix(char_len);
|
sym.next = offs == text.size() ? -1 : index + 1;
|
||||||
sym.next = text.empty() ? -1 : index + 1;
|
|
||||||
index++;
|
index++;
|
||||||
symbols_.emplace_back(std::move(sym));
|
symbols_.emplace_back(std::move(sym));
|
||||||
}
|
}
|
||||||
|
@ -298,14 +303,16 @@ struct llama_tokenizer {
|
||||||
auto & right_sym = symbols_[bigram.right];
|
auto & right_sym = symbols_[bigram.right];
|
||||||
|
|
||||||
// if one of the symbols already got merged, skip it.
|
// if one of the symbols already got merged, skip it.
|
||||||
if (left_sym.text.empty() || right_sym.text.empty() ||
|
if (left_sym.n == 0 || right_sym.n == 0 ||
|
||||||
left_sym.text.size() + right_sym.text.size() != bigram.size) {
|
left_sym.n + right_sym.n != bigram.size) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// merge the right sym into the left one
|
// merge the right sym into the left one
|
||||||
left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
|
left_sym.n += right_sym.n;
|
||||||
right_sym.text = std::string_view("");
|
right_sym.n = 0;
|
||||||
|
|
||||||
|
//printf("left = '%.*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
|
||||||
|
|
||||||
// remove the right sym from the chain
|
// remove the right sym from the chain
|
||||||
left_sym.next = right_sym.next;
|
left_sym.next = right_sym.next;
|
||||||
|
@ -319,12 +326,12 @@ struct llama_tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i != -1; i = symbols_[i].next) {
|
for (int i = 0; i != -1; i = symbols_[i].next) {
|
||||||
auto& symbol = symbols_[i];
|
auto & symbol = symbols_[i];
|
||||||
auto token = vocab_.token_to_id.find(std::string(symbol.text));
|
auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
|
||||||
|
|
||||||
if (token == vocab_.token_to_id.end()) {
|
if (token == vocab_.token_to_id.end()) {
|
||||||
// output any symbols that did not form tokens as bytes.
|
// output any symbols that did not form tokens as bytes.
|
||||||
for (int j = 0; j < symbol.text.size(); ++j) {
|
for (int j = 0; j < (int) symbol.n; ++j) {
|
||||||
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
||||||
output.push_back(token_id);
|
output.push_back(token_id);
|
||||||
}
|
}
|
||||||
|
@ -340,8 +347,8 @@ private:
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
|
const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
|
||||||
auto token = vocab_.token_to_id.find(std::string(text));
|
auto token = vocab_.token_to_id.find(text);
|
||||||
|
|
||||||
if (token == vocab_.token_to_id.end()) {
|
if (token == vocab_.token_to_id.end()) {
|
||||||
return;
|
return;
|
||||||
|
@ -399,16 +406,12 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
|
||||||
vocab.token_to_id[word] = i;
|
vocab.token_to_id[word] = i;
|
||||||
vocab.id_to_token[i] = word;
|
vocab.id_to_token[i] = word;
|
||||||
vocab.score[i] = score;
|
vocab.score[i] = score;
|
||||||
|
|
||||||
//if (i < 30000) {
|
|
||||||
// fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
|
|
||||||
//}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos) {
|
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
|
||||||
llama_tokenizer tokenizer(vocab);
|
llama_tokenizer tokenizer(vocab);
|
||||||
std::vector<llama_vocab::id> output;
|
std::vector<llama_vocab::id> output;
|
||||||
|
|
||||||
|
|
2
utils.h
2
utils.h
|
@ -79,7 +79,7 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
|
||||||
|
|
||||||
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
|
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
|
||||||
// ref: https://github.com/google/sentencepiece
|
// ref: https://github.com/google/sentencepiece
|
||||||
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos);
|
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
|
||||||
|
|
||||||
// sample next token given probabilities for each embedding
|
// sample next token given probabilities for each embedding
|
||||||
//
|
//
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue