From 14fa3d108b4f031c0a74d04b56cf951014fdef83 Mon Sep 17 00:00:00 2001
From: John Doe
Date: Mon, 1 May 2023 06:52:01 -0400
Subject: [PATCH] Handle C++ libraries without threading support.

---
 examples/common.cpp                |  4 ++++
 examples/common.h                  |  3 +++
 examples/embedding/embedding.cpp   |  2 +-
 examples/main/main.cpp             |  2 +-
 examples/perplexity/perplexity.cpp |  2 +-
 llama.cpp                          | 36 +++++++++++++++++++++++++++---------
 6 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index ad7b0bba3..89834c58f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -62,8 +62,12 @@ int32_t get_num_physical_cores() {
 #elif defined(_WIN32)
     //TODO: Implement
 #endif
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+#else
+    return 1;
+#endif
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
diff --git a/examples/common.h b/examples/common.h
index 627696e30..32ba0740b 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -7,8 +7,11 @@
 #include <string>
 #include <vector>
 #include <random>
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
 #include <thread>
+#endif
 #include <unordered_map>
+#include <cstdint>
 
 //
 // CLI argument parsing
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index b3e001476..d36b948c6 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -59,7 +59,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     int n_past = 0;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7dc100512..140d1f846 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 2ca338835..d566a3699 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     perplexity(ctx, params);
diff --git a/llama.cpp b/llama.cpp
index 868a58a8b..e8e532d04 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -24,9 +24,13 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
 #include <thread>
 #include <atomic>
 #include <mutex>
+#else
+#warning "C++ standard library is configured for single threading."
+#endif
 #include <sstream>
 #include <numeric>
 
@@ -1889,7 +1893,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     if (nthread <= 0) {
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
         nthread = std::thread::hardware_concurrency();
+#else
+        nthread = 1;
+#endif
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
@@ -1900,8 +1908,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
     std::vector<std::thread> workers;
     std::mutex mutex;
+#endif
 
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
@@ -1969,29 +1979,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                auto compute = [&, new_type, f32_data, new_data, nelements, chunk_size] () {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        size_t first = counter; counter += chunk_size;
-                        if (first >= nelements) {
-                            if (!local_hist.empty()) {
-                                for (int j=0; j<int(local_hist.size()); ++j) {
-                                    hist_cur[j] += local_hist[j];
-                                }
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
+                        size_t first;
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
+                        std::unique_lock<std::mutex> lock(mutex);
+#endif
+                        first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) {
+                                    hist_cur[j] += local_hist[j];
+                                }
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
+                        lock.unlock();
+#endif
                         size_t last = std::min(nelements, first + chunk_size);
                         if (local_hist.empty()) {
                             local_hist.resize(hist_cur.size(), 0);
                         }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
                 if ((int) workers.size() < nthread_use - 1) {
                     workers.resize(nthread_use - 1);
                 }
@@ -1998,9 +2016,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 for (int it = 0; it < nthread_use - 1; ++it) {
                     workers[it] = std::thread(compute);
                 }
+#endif
                 compute();
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
                 for (int it = 0; it < nthread_use - 1; ++it) {
                     workers[it].join();
                 }
+#endif
             }
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
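
Note (reviewer commentary, not part of the patch): the guard used throughout keys on two macros. __STDCPP_THREADS__ is the standard C++11 predefined macro that an implementation defines to 1 when a program can have more than one thread of execution; _GLIBCXX_HAS_GTHREADS is libstdc++'s configure-time equivalent, which covers GCC toolchains that predate reliable support for the standard macro. Below is a minimal standalone probe of the same condition; the file name probe_threads.cpp is hypothetical, and only a hosted C++11 compiler is assumed:

    // probe_threads.cpp -- report whether the C++ runtime has threading
    // support, using the same macros this patch keys on.
    #include <cstdio>

    #if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
    #include <thread>   // only safe to include when the runtime has threads
    #endif

    int main() {
    #if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
        // hardware_concurrency() may legitimately return 0 ("unknown"),
        // which is why get_num_physical_cores() keeps a fallback value.
        std::printf("threads: yes, hardware_concurrency = %u\n",
                    std::thread::hardware_concurrency());
    #else
        std::printf("threads: no, n_threads forced to 1\n");
    #endif
        return 0;
    }

Building this against a single-threaded C++ standard library (e.g. a GCC toolchain configured with --disable-threads) should take the #else path, matching the behavior the patch adds to get_num_physical_cores() and llama_model_quantize_internal().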