Fix coding style

This commit is contained in:
goerch 2023-10-02 13:01:46 +02:00 committed by GitHub
parent 3d162cc8ad
commit 5aee498d97
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 15 additions and 15 deletions

View file

@ -4590,7 +4590,7 @@ private:
work_queue.push(bigram); work_queue.push(bigram);
} }
std::vector<std::string> bpe_gpt2_preprocess(const std::string& text) { std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
std::vector<std::string> bpe_words; std::vector<std::string> bpe_words;
std::vector<std::string> bpe_encoded_words; std::vector<std::string> bpe_encoded_words;
@ -4612,13 +4612,13 @@ private:
text_utf.emplace_back(codepoint_to_utf8(cps[i])); text_utf.emplace_back(codepoint_to_utf8(cps[i]));
for (int i = 0; i < (int)text_utf.size(); i++) { for (int i = 0; i < (int)text_utf.size(); i++) {
const std::string& utf_char = text_utf[i]; const std::string & utf_char = text_utf[i];
bool split_condition = false; bool split_condition = false;
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes; // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
int bytes_remain = text_utf.size() - i; int bytes_remain = text_utf.size() - i;
// forward backward lookups // forward backward lookups
const std::string& utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
const std::string& utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
// handling contractions // handling contractions
if (!split_condition && bytes_remain >= 2) { if (!split_condition && bytes_remain >= 2) {
@ -4719,9 +4719,9 @@ private:
} }
} }
for (std::string& word : bpe_words) { for (std::string & word : bpe_words) {
std::string encoded_token = ""; std::string encoded_token = "";
for (char& c : word) { for (char & c : word) {
encoded_token += bytes_to_unicode_bpe(c); encoded_token += bytes_to_unicode_bpe(c);
} }
bpe_encoded_words.emplace_back(encoded_token); bpe_encoded_words.emplace_back(encoded_token);
@ -7654,7 +7654,7 @@ int llama_tokenize(
return res.size(); return res.size();
} }
static std::string llama_decode_text(const std::string& text) { static std::string llama_decode_text(const std::string & text) {
std::string decoded_text; std::string decoded_text;
auto unicode_sequences = codepoints_from_utf8(text); auto unicode_sequences = codepoints_from_utf8(text);
for (auto& unicode_sequence : unicode_sequences) { for (auto& unicode_sequence : unicode_sequences) {

View file

@ -73,7 +73,7 @@ int main(int argc, char **argv) {
return 2; return 2;
} }
} }
catch (const std::invalid_argument&) { catch (const std::invalid_argument &) {
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
} }
} }

View file

@ -248,7 +248,7 @@ static std::string codepoint_to_utf8(uint32_t cp) {
return result; return result;
} }
static std::string codepoints_to_utf8(const std::vector<uint32_t>& cps) { static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
std::string result; std::string result;
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < cps.size(); ++i) {
result.append(codepoint_to_utf8(cps[i])); result.append(codepoint_to_utf8(cps[i]));
@ -256,7 +256,7 @@ static std::string codepoints_to_utf8(const std::vector<uint32_t>& cps) {
return result; return result;
} }
static uint32_t codepoint_from_utf8(const std::string& utf8, size_t& offset) { static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size()); assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) { if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0]; auto result = utf8[offset + 0];
@ -290,7 +290,7 @@ static uint32_t codepoint_from_utf8(const std::string& utf8, size_t& offset) {
throw std::invalid_argument("invalid string"); throw std::invalid_argument("invalid string");
} }
static std::vector<uint32_t> codepoints_from_utf8(const std::string& utf8) { static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result; std::vector<uint32_t> result;
size_t offset = 0; size_t offset = 0;
while (offset < utf8.size()) { while (offset < utf8.size()) {
@ -314,7 +314,7 @@ static std::vector<uint16_t> codepoint_to_utf16(uint32_t cp) {
return result; return result;
} }
static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t>& cps) { static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t> & cps) {
std::vector<uint16_t> result; std::vector<uint16_t> result;
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < cps.size(); ++i) {
auto temp = codepoint_to_utf16(cps[i]); auto temp = codepoint_to_utf16(cps[i]);
@ -323,7 +323,7 @@ static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t>& cp
return result; return result;
} }
static uint32_t codepoint_from_utf16(const std::vector<uint16_t>& utf16, size_t& offset) { static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
assert(offset < utf16.size()); assert(offset < utf16.size());
if (((utf16[0] >> 10) << 10) != 0xd800) { if (((utf16[0] >> 10) << 10) != 0xd800) {
auto result = utf16[offset + 0]; auto result = utf16[offset + 0];
@ -340,7 +340,7 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t>& utf16, size_t&
throw std::invalid_argument("invalid string"); throw std::invalid_argument("invalid string");
} }
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t>& utf16) { static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
std::vector<uint32_t> result; std::vector<uint32_t> result;
size_t offset = 0; size_t offset = 0;
while (offset < utf16.size()) while (offset < utf16.size())
@ -395,7 +395,7 @@ static int codepoint_type(uint32_t cp) {
return codepoint_types[cp]; return codepoint_types[cp];
} }
static int codepoint_type(std::string utf8) { static int codepoint_type(const std::string & utf8) {
if (utf8.length() == 0) if (utf8.length() == 0)
return CODEPOINT_TYPE_UNIDENTIFIED; return CODEPOINT_TYPE_UNIDENTIFIED;
size_t offset = 0; size_t offset = 0;