Handle C++ libraries without threading support.

John Doe 2023-05-01 06:52:01 -04:00
parent f4cef87edf
commit 14fa3d108b
6 changed files with 37 additions and 12 deletions
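
The patch guards every use of std::thread behind the __STDCPP_THREADS__ / _GLIBCXX_HAS_GTHREADS feature-test macros and falls back to a single-threaded path when neither is defined. As a rough, self-contained sketch of that pattern (not part of the commit; the pick_thread_count helper below is a hypothetical name invented for illustration):

// Sketch only (not part of the commit): the compile-time fallback this patch
// applies wherever std::thread is used. pick_thread_count() is a hypothetical
// helper, not something added by the diff below.
#include <cstdio>

#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
#include <thread>
#endif

static unsigned int pick_thread_count() {
#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
    // Threaded standard library: ask how many hardware threads are available.
    unsigned int n = std::thread::hardware_concurrency();
    return n > 0 ? n : 1;
#else
    // Single-threaded standard library: std::thread is unavailable, so run serially.
    return 1;
#endif
}

int main() {
    printf("using %u thread(s)\n", pick_thread_count());
    return 0;
}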

@@ -62,8 +62,12 @@ int32_t get_num_physical_cores() {
 #elif defined(_WIN32)
     //TODO: Implement
 #endif
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+#else
+    return 1;
+#endif
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

@@ -7,8 +7,11 @@
 #include <string>
 #include <vector>
 #include <random>
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
 #include <thread>
+#endif
 #include <unordered_map>
+#include <chrono>
 
 //
 // CLI argument parsing

@@ -59,7 +59,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     int n_past = 0;

@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters

@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
     {
         fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+                params.n_threads, get_num_physical_cores(), llama_print_system_info());
     }
 
     perplexity(ctx, params);

@@ -24,9 +24,13 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
 #include <thread>
 #include <atomic>
 #include <mutex>
+#else
+#warning "C++ standard library is configured for single threading."
+#endif
 #include <sstream>
 #include <numeric>
@@ -1889,7 +1893,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     if (nthread <= 0) {
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
         nthread = std::thread::hardware_concurrency();
+#else
+        nthread = 1;
+#endif
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
@@ -1900,8 +1908,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
     std::vector<std::thread> workers;
     std::mutex mutex;
+#endif
 
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
@@ -1969,29 +1979,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                auto compute = [&, new_type, f32_data, new_data, nelements, chunk_size] () {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        size_t first = counter; counter += chunk_size;
-                        if (first >= nelements) {
-                            if (!local_hist.empty()) {
-                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
+                        size_t first;
+                        {
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
+                            std::unique_lock<std::mutex> lock(mutex);
+#endif
+                            first = counter; counter += chunk_size;
+                            if (first >= nelements) {
+                                if (!local_hist.empty()) {
+                                    for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                    new_size += local_size;
+                                }
+                                break;
+                            }
+                        }
                         size_t last = std::min(nelements, first + chunk_size);
                         if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
+#if __STDCPP_THREADS__ || _GLIBCXX_HAS_GTHREADS
                 if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
                 for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
                 compute();
                 for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+#else
+                compute();
+#endif
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);