From 961e98f24549ef58f5f172bfe2edb92ac0f2fd8f Mon Sep 17 00:00:00 2001
From: Douglas Hanley
Date: Fri, 9 Feb 2024 11:53:17 -0600
Subject: [PATCH] style fixes

---
 llama.cpp | 63 +++++++++++++++++++++----------------------------------
 1 file changed, 24 insertions(+), 39 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 35770a1ab..e4498c704 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8175,7 +8175,9 @@ struct llm_tokenizer_wpm {
         // find the longest tokens that form the words
         for (const std::string &word : words) {
             // skip empty words
-            if (word.size() == 0) continue;
+            if (word.size() == 0) {
+                continue;
+            }
 
             // prepend phantom space
             std::string word1 = "\xe2\x96\x81" + word;
@@ -8201,7 +8203,9 @@
                 }
 
                 // must be an unknown character
-                if (!match) i++;
+                if (!match) {
+                    i++;
+                }
             }
 
             // we didn't find any matches for this word
@@ -8215,8 +8219,7 @@
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
-        std::string ori_str = text;
-        ori_str = normalize(ori_str);
+        std::string ori_str = normalize(text);
         uint64_t ori_size = ori_str.size();
 
         // single punct / single symbol / single digit
@@ -8267,8 +8270,7 @@
     std::string normalize(const std::string &text) {
         // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
         std::string text2 = strip_accents(text);
-        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i]))
-        {
+        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
             char c = text2[i];
             if (c >= 'A' && c <= 'Z')
                 text2[i] = c - 'A' + 'a';
@@ -8331,20 +8333,16 @@
             {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'}, {"õ", 'o'}, {"ö", 'o'},
             {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'}, {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'},
             {"Ý", 'Y'}, {"ý", 'y'},
-            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'},{ "ñ", 'n'},
+            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
         };
 
-        for (size_t i = 0; i < inputString.length();)
-        {
+        for (size_t i = 0; i < inputString.length();) {
             int len = utf8_len(inputString[i]);
             std::string curChar = inputString.substr(i, len);
             auto iter = accentMap.find(curChar);
-            if (iter != accentMap.end())
-            {
+            if (iter != accentMap.end()) {
                 resultString += iter->second;
-            }
-            else
-            {
+            } else {
                 resultString += curChar;
             }
             i += len;
@@ -8362,12 +8360,12 @@ struct llm_tokenizer_wpm {
     const llama_vocab & vocab;
 };
 
-typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
     FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
     FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
 } FRAGMENT_BUFFER_VARIANT_TYPE;
 
-struct fragment_buffer_variant{
+struct fragment_buffer_variant {
     fragment_buffer_variant(llama_vocab::id _token)
     :
         type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
@@ -8397,8 +8395,7 @@ struct fragment_buffer_variant{
 
 // #define PRETOKENIZERDEBUG
 
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
-{
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const auto & st: vocab.special_tokens_cache) {
         const auto & special_token = st.first;
@@ -8516,10 +8513,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
 
                         // TODO: It's likely possible to get rid of this string copy entirely
@@ -8539,19 +8534,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
                         tokenizer.tokenize(raw_text, output);
-                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
@@ -8559,19 +8550,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #endif
                         llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
-                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
             {
-                for (const auto & fragment: fragment_buffer)
-                {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-                    {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
@@ -8579,9 +8566,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #endif
                         llm_tokenizer_wpm tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
-                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    {
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }