Revert back to C++11

This commit is contained in:
Georgi Gerganov 2023-03-21 11:34:58 +02:00
parent ecd982ddc0
commit 11d84b2ed9
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
4 changed files with 22 additions and 15 deletions

View file

@@ -221,14 +221,14 @@ add_library(utils OBJECT
utils.h) utils.h)
utils.h) utils.h)
target_include_directories(utils PUBLIC .) target_include_directories(utils PUBLIC .)
target_compile_features(utils PUBLIC cxx_std_11) target_compile_features(utils PUBLIC cxx_std_11) # don't bump
add_library(ggml OBJECT add_library(ggml OBJECT
ggml.c ggml.c
ggml.h) ggml.h)
target_include_directories(ggml PUBLIC .) target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_17) target_compile_features(ggml PUBLIC c_std_11) # don't bump
# #
# Linking # Linking

View file

@@ -30,8 +30,9 @@ endif
# Compile flags # Compile flags
# #
# keep standard at C11 and C++11
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS = LDFLAGS =
# OS specific # OS specific

View file

@@ -250,7 +250,7 @@ struct llama_sp_symbol {
using index = int; using index = int;
index prev; index prev;
index next; index next;
std::string_view text; std::string text;
}; };
struct llama_sp_bigram { struct llama_sp_bigram {
@@ -270,15 +270,17 @@ struct llama_sp_bigram {
struct llama_tokenizer { struct llama_tokenizer {
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {} llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
void tokenize(std::string_view text, std::vector<llama_vocab::id> & output) { void tokenize(std::string text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars // split string into utf8 chars
int index = 0; int index = 0;
while (!text.empty()) { while (!text.empty()) {
llama_sp_symbol sym; llama_sp_symbol sym;
size_t char_len = std::min(text.size(), utf8_len(text.data()[0])); size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
sym.text = std::string_view(text.data(), char_len); //sym.text = std::string_view(text.data(), char_len);
sym.text = text.substr(0, char_len);
sym.prev = index - 1; sym.prev = index - 1;
text.remove_prefix(char_len); //text.remove_prefix(char_len);
text = text.substr(char_len);
sym.next = text.empty() ? -1 : index + 1; sym.next = text.empty() ? -1 : index + 1;
index++; index++;
symbols_.emplace_back(std::move(sym)); symbols_.emplace_back(std::move(sym));
@@ -304,8 +306,9 @@ struct llama_tokenizer {
} }
// merge the right sym into the left one // merge the right sym into the left one
left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size()); //left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
right_sym.text = std::string_view(""); //right_sym.text = std::string_view("");
left_sym.text += right_sym.text;
// remove the right sym from the chain // remove the right sym from the chain
left_sym.next = right_sym.next; left_sym.next = right_sym.next;
@@ -319,12 +322,12 @@ struct llama_tokenizer {
} }
for (int i = 0; i != -1; i = symbols_[i].next) { for (int i = 0; i != -1; i = symbols_[i].next) {
auto& symbol = symbols_[i]; auto & symbol = symbols_[i];
auto token = vocab_.token_to_id.find(std::string(symbol.text)); auto token = vocab_.token_to_id.find(std::string(symbol.text));
if (token == vocab_.token_to_id.end()) { if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes. // output any symbols that did not form tokens as bytes.
for (int j = 0; j < symbol.text.size(); ++j) { for (int j = 0; j < (int) symbol.text.size(); ++j) {
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3; llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
output.push_back(token_id); output.push_back(token_id);
} }
@@ -340,8 +343,11 @@ private:
return; return;
} }
std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size()); //std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
auto token = vocab_.token_to_id.find(std::string(text)); //auto token = vocab_.token_to_id.find(std::string(text));
const std::string text = symbols_[left].text + symbols_[right].text;
auto token = vocab_.token_to_id.find(text);
if (token == vocab_.token_to_id.end()) { if (token == vocab_.token_to_id.end()) {
return; return;
@@ -408,7 +414,7 @@ bool llama_vocab_load(const std::string &amp; fname, llama_vocab &amp; vocab) {
return true; return true;
} }
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos) { std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
llama_tokenizer tokenizer(vocab); llama_tokenizer tokenizer(vocab);
std::vector<llama_vocab::id> output; std::vector<llama_vocab::id> output;

View file

@@ -79,7 +79,7 @@ bool llama_vocab_load(const std::string &amp; fname, llama_vocab &amp; vocab);
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works .. // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece // ref: https://github.com/google/sentencepiece
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos); std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
// sample next token given probabilities for each embedding // sample next token given probabilities for each embedding
// //