From 14fa3d108b4f031c0a74d04b56cf951014fdef83 Mon Sep 17 00:00:00 2001
From: John Doe
Date: Mon, 1 May 2023 06:52:01 -0400
Subject: [PATCH] Handle C++ libraries without threading support.

---
 examples/common.cpp                |  4 ++++
 examples/common.h                  |  3 +++
 examples/embedding/embedding.cpp   |  2 +-
 examples/main/main.cpp             |  2 +-
 examples/perplexity/perplexity.cpp |  2 +-
 llama.cpp                          | 36 +++++++++++++++++++++++++++---------
 6 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index ad7b0bba3..89834c58f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -62,8 +62,12 @@ int32_t get_num_physical_cores() {
 #elif defined(_WIN32)
     //TODO: Implement
 #endif
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+#else
+    return 1;
+#endif
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
diff --git a/examples/common.h b/examples/common.h
index 627696e30..32ba0740b 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -7,8 +7,11 @@
 #include <string>
 #include <vector>
 #include <random>
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
 #include <thread>
+#endif
 #include <unordered_map>
+#include <cstdint>
 
 //
 // CLI argument parsing
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index b3e001476..d36b948c6 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -59,7 +59,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     int n_past = 0;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7dc100512..140d1f846 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 2ca338835..d566a3699 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     perplexity(ctx, params);
diff --git a/llama.cpp b/llama.cpp
index 868a58a8b..e8e532d04 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -24,9 +24,13 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
 #include <thread>
 #include <atomic>
 #include <mutex>
+#else
+#warning "C++ standard library is configured for single threading."
+#endif
 #include <sstream>
 #include <numeric>
 
@@ -1889,7 +1893,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     if (nthread <= 0) {
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
         nthread = std::thread::hardware_concurrency();
+#else
+        nthread = 1;
+#endif
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
@@ -1900,8 +1908,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
     std::vector<std::thread> workers;
     std::mutex mutex;
+#endif
 
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
@@ -1969,29 +1979,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                auto compute = [&, new_type, f32_data, new_data, nelements, chunk_size] () {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        size_t first = counter; counter += chunk_size;
-                        if (first >= nelements) {
-                            if (!local_hist.empty()) {
-                                for (int j=0; j<int(local_hist.size()); ++j) {
-                                    hist_cur[j] += local_hist[j];
-                                }
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
+                        size_t first;
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
+                        std::unique_lock<std::mutex> lock(mutex);
+#endif
+                        first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) {
+                                    hist_cur[j] += local_hist[j];
+                                }
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
+                        lock.unlock();
+#endif
                         size_t last = std::min(nelements, first + chunk_size);
                         if (local_hist.empty()) {
                             local_hist.resize(hist_cur.size(), 0);
                         }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
                 if ((int) workers.size() < nthread_use - 1) {
                     workers.resize(nthread_use - 1);
                 }
@@ -1998,9 +2016,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 for (int it = 0; it < nthread_use - 1; ++it) {
                     workers[it] = std::thread(compute);
                 }
+#endif
                 compute();
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
                 for (int it = 0; it < nthread_use - 1; ++it) {
                     workers[it].join();
                 }
+#endif
             }
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
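
Note (reviewer commentary, not part of the patch): the guard used throughout keys on two macros. __STDCPP_THREADS__ is the standard C++11 predefined macro that an implementation defines to 1 when a program can have more than one thread of execution; _GLIBCXX_HAS_GTHREADS is libstdc++'s configure-time equivalent, which covers GCC toolchains that predate reliable support for the standard macro. Below is a minimal standalone probe of the same condition; the file name probe_threads.cpp is hypothetical, and only a hosted C++11 compiler is assumed:

    // probe_threads.cpp -- report whether the C++ runtime has threading
    // support, using the same macros this patch keys on.
    #include <cstdio>

    #if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
    #include <thread>   // only safe to include when the runtime has threads
    #endif

    int main() {
    #if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
        // hardware_concurrency() may legitimately return 0 ("unknown"),
        // which is why get_num_physical_cores() keeps a fallback value.
        std::printf("threads: yes, hardware_concurrency = %u\n",
                    std::thread::hardware_concurrency());
    #else
        std::printf("threads: no, n_threads forced to 1\n");
    #endif
        return 0;
    }

Building this against a single-threaded C++ standard library (e.g. a GCC toolchain configured with --disable-threads) should take the #else path, matching the behavior the patch adds to get_num_physical_cores() and llama_model_quantize_internal().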