From 776f5e29cd7eeec09001d5b9672069fd4a485bc6 Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: Sat, 23 Dec 2023 21:59:34 -0500 Subject: [PATCH 1/6] initial import of hpx support --- CMakeLists.txt | 28 ++++++++++++- Makefile | 44 ++++++++++++++++++++ llama.cpp | 108 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e3cd43ab3..9e3ce2080 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,7 @@ option(LLAMA_CLBLAST "llama: use CLBlast" option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_MPI "llama: use MPI" OFF) +option(LLAMA_HPX "llama: use HPX" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) @@ -320,6 +321,10 @@ if (LLAMA_CUBLAS) endif() endif() +if(LLAMA_MPI AND LLAMA_HPX) + message(FATAL_ERROR "MPI and HPX are not currently compatible together") +endif() + if (LLAMA_MPI) cmake_minimum_required(VERSION 3.10) find_package(MPI) @@ -344,6 +349,17 @@ if (LLAMA_MPI) endif() endif() +if (LLAMA_HPX) + cmake_minimum_required(VERSION 3.10) + find_package (HPX) + if (HPX_FOUND) + add_compile_definitions(GGML_USE_HPX) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${HPX_CXXFLAGS}) + else() + message(FATAL_ERROR "HPX not found") + endif() +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) @@ -727,7 +743,11 @@ add_library(ggml OBJECT target_include_directories(ggml PUBLIC . 
${LLAMA_EXTRA_INCLUDES}) target_compile_features(ggml PUBLIC c_std_11) # don't bump -target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) +if(LLAMA_HPX AND HPX_FOUND) + target_link_libraries(ggml PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS}) +else() + target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) +endif() if (GGML_USE_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() @@ -749,6 +769,12 @@ add_library(llama target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump + +if(LLAMA_HPX AND HPX_FOUND) + target_link_libraries(llama PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS}) + target_compile_options (llama PRIVATE ${HPX_CXXFLAGS}) +endif() + target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} diff --git a/Makefile b/Makefile index 8273f8400..15f236d25 100644 --- a/Makefile +++ b/Makefile @@ -103,7 +103,11 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -I. -Icommon MK_CFLAGS = -std=c11 -fPIC +ifdef LLAMA_HPX +MK_CXXFLAGS = -std=c++17 -fPIC +else MK_CXXFLAGS = -std=c++11 -fPIC +endif # -Ofast tends to produce faster code, but may not be available for some compilers. 
ifdef LLAMA_FAST @@ -345,6 +349,46 @@ ifdef LLAMA_MPI OBJS += ggml-mpi.o endif # LLAMA_MPI +ifdef LLAMA_HPX + ifndef HWLOC_FOUND + HWLOC_PKG:=hwloc + HWLOC_REQPKG:=$(shell pkg-config --exists $(HWLOC_PKG) && echo '$(HWLOC_PKG)') + ifneq ($(HWLOC_REQPKG),) + HWLOC_FOUND:=1 + HWLOC_CXXFLAGS:=$(shell pkg-config --cflags $(HWLOC_PKG)) + HWLOC_LDFLAGS:=$(shell pkg-config --libs $(HWLOC_PKG)) + warn := $(warning hwloc found) + else + $(warning 'hwloc' not found) + endif + endif + + ifndef HWLOC_FOUND + $(error hwloc not found) + endif + + ifndef HPX_FOUND + HPX_PKG:=hpx_component + HPX_REQPKG:=$(shell pkg-config --exists $(HPX_PKG) && echo '$(HPX_PKG)') + ifneq ($(HPX_REQPKG),) + HPX_FOUND:=1 + HPX_CXXFLAGS:=$(shell pkg-config --cflags hpx_component) + HPX_LDFLAGS:=$(shell pkg-config --libs hpx_component) + warn := $(warning HPX found) + else + $(warning 'HPX' not found) + endif + endif + + ifndef HPX_FOUND + $(error HPX not found) + endif + + MK_CPPFLAGS += -DGGML_USE_HPX $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS) + MK_CXXFLAGS += -Wno-cast-qual $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS) + MK_LDFLAGS += -Wno-cast-qual $(HWLOC_LDFLAGS) $(HPX_LDFLAGS) + endif # LLAMA_HPX + ifdef LLAMA_OPENBLAS MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) diff --git a/llama.cpp b/llama.cpp index edd2910b3..8ad116b8f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,12 @@ #ifdef GGML_USE_MPI # include "ggml-mpi.h" #endif +#ifdef GGML_USE_HPX +# include +# include +# include +# include +#endif #ifndef QK_K # ifdef GGML_QKK_64 # define QK_K 64 @@ -8419,6 +8425,81 @@ struct quantize_state_internal { {} }; +#if defined(GGML_USE_HPX) + +static void llama_convert_tensor_internal( + struct ggml_tensor * tensor, std::vector> & output, std::vector> & futures, + const size_t nelements, const int nthread +) { + if (output.size() < nelements) { + output.resize(nelements); + } + float * f32_output = (float *) output.data(); 
+ + ggml_type_traits_t qtype; + if (ggml_is_quantized(tensor->type)) { + qtype = ggml_internal_get_type_traits(tensor->type); + if (qtype.to_float == NULL) { + throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); + } + } else if (tensor->type != GGML_TYPE_F16) { + throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); + } + + if (nthread < 2) { + if (tensor->type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); + } else if (ggml_is_quantized(tensor->type)) { + qtype.to_float(tensor->data, f32_output, nelements); + } else { + GGML_ASSERT(false); // unreachable + } + return; + } + + size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type); + size_t block_size_bytes = ggml_type_size(tensor->type); + + GGML_ASSERT(nelements % block_size == 0); + size_t nblocks = nelements / block_size; + size_t blocks_per_thread = nblocks / nthread; + size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count + + size_t in_buff_offs = 0; + size_t out_buff_offs = 0; + + hpx::future fut = + hpx::run_as_hpx_thread([&futures, nthread, qtype, block_size, block_size_bytes, blocks_per_thread, spare_blocks, &tensor, &in_buff_offs, &f32_output, &out_buff_offs]() -> hpx::future + { + for (int tnum = 0; tnum < nthread; tnum++) { + size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? 
spare_blocks : 0); // num blocks for this thread + size_t thr_elems = thr_blocks * block_size; // number of elements for this thread + size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread + + auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); + } else { + qtype.to_float(inbuf, outbuf, nels); + } + }; + + futures.push_back(hpx::async(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); + + in_buff_offs += thr_block_bytes; + out_buff_offs += thr_elems; + } + + hpx::wait_all(futures); + return hpx::make_ready_future(); + }); + + fut.wait(); + futures.clear(); +} + +#else + static void llama_convert_tensor_internal( struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -8480,6 +8561,8 @@ static void llama_convert_tensor_internal( workers.clear(); } +#endif + static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { const std::string name = ggml_get_name(tensor); @@ -8687,8 +8770,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s size_t total_size_new = 0; std::vector hist_all(1 << 4, 0); +#if defined(GGML_USE_HPX) + { + std::string thread_arg = "--hpx:threads=" + std::to_string(nthread); + hpx::init_params params; + params.cfg = { thread_arg }; + hpx::start(nullptr, 0, nullptr, params); + } + + std::vector> futures; + futures.reserve(nthread); +#else std::vector workers; workers.reserve(nthread); +#endif std::mutex mutex; int idx = 0; @@ -8772,7 +8867,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { throw std::runtime_error(format("requantizing from type %s is 
disabled", ggml_type_name(tensor->type))); } else { +#if defined(GGML_USE_HPX) + llama_convert_tensor_internal(tensor, f32_conv_buf, futures, nelements, nthread); +#else llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread); +#endif f32_data = (float *) f32_conv_buf.data(); } @@ -8813,12 +8912,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); } }; +#if defined(GGML_USE_HPX) + for (int it = 0; it < nthread_use - 1; ++it) { + futures.push_back(hpx::async(compute)); + } + compute(); + hpx::wait_all(futures); + futures.clear(); +#else for (int it = 0; it < nthread_use - 1; ++it) { workers.emplace_back(compute); } compute(); for (auto & w : workers) { w.join(); } workers.clear(); +#endif } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); From e1eb3d14f1eed74c6785722282b7db35797b6fb6 Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: Sat, 23 Dec 2023 22:15:52 -0500 Subject: [PATCH 2/6] initial import of hpx support --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 01aef2afc..206894cf1 100644 --- a/README.md +++ b/README.md @@ -335,6 +335,24 @@ Finally, you're ready to run a computation using `mpirun`: mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 ``` +### HPX Build + +This build has a dependency on the [HPX](https://github.com/STEllAR-GROUP/hpx) asynchronous many task runtime system. Users are encouraged to compile HPX with tcmalloc. HPX provides a user-land thread implementation and a work-stealing thread management implementation. Both features reduce the number of system calls required of the application which can improve performance. 
HPX emphasizes 'futurization' of applications; users are encouraged to craft dataflow dependency graphs using futures and HPX's implementation of `std::async`. HPX achieves best performance on large workloads. The BLIS BLAS library has support for HPX. The HPX support provided by this build will improve the performance of the BLIS HPX backend when applied to llama.cpp. + + - Using `make`: + - On Linux: + ```bash + make LLAMA_HPX=1 + ``` + + - Using `CMake` on Linux: + ```bash + mkdir build + cd build + CXX=<path to C++ compiler> cmake -DHPX_DIR=<path to HPX CMake configuration directory> -DLLAMA_HPX=1 .. + make + ``` + ### BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use: From 1cdfdb34e032c459e75bc8b49edf86fded33ff78 Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: Sat, 23 Dec 2023 22:35:47 -0500 Subject: [PATCH 3/6] fixed hpx runtime initialization and finalization --- llama.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index 8ad116b8f..eb8178b39 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8771,13 +8771,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector hist_all(1 << 4, 0); #if defined(GGML_USE_HPX) - { - std::string thread_arg = "--hpx:threads=" + std::to_string(nthread); - hpx::init_params params; - params.cfg = { thread_arg }; - hpx::start(nullptr, 0, nullptr, params); - } - std::vector> futures; futures.reserve(nthread); #else @@ -9352,6 +9345,7 @@ void llama_backend_init(bool numa) { struct ggml_init_params params = { 0, NULL, false }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); + 
} if (numa) { @@ -9361,12 +9355,27 @@ void llama_backend_init(bool numa) { #ifdef GGML_USE_MPI ggml_mpi_backend_init(); #endif +#ifdef GGML_USE_HPX + { + const auto nthread = std::thread::hardware_concurrency(); + std::string thread_arg = "--hpx:threads=" + std::to_string(nthread); + hpx::init_params params; + params.cfg = { thread_arg }; + hpx::start(nullptr, 0, nullptr, params); + } +#endif } void llama_backend_free(void) { #ifdef GGML_USE_MPI ggml_mpi_backend_free(); #endif +#ifdef GGML_USE_HPX + { + hpx::post([]() { hpx::finalize(); }); + hpx::stop(); + } +#endif } int64_t llama_time_us(void) { From beb28d68ab4c1d163e59861f0ae0648c6e90729e Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: Thu, 28 Dec 2023 12:33:58 -0500 Subject: [PATCH 4/6] trying to remove mutex/lock from parallel region --- llama.cpp | 82 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/llama.cpp b/llama.cpp index eb8178b39..47d79015f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8471,7 +8471,7 @@ static void llama_convert_tensor_internal( hpx::future fut = hpx::run_as_hpx_thread([&futures, nthread, qtype, block_size, block_size_bytes, blocks_per_thread, spare_blocks, &tensor, &in_buff_offs, &f32_output, &out_buff_offs]() -> hpx::future { - for (int tnum = 0; tnum < nthread; tnum++) { + for (int tnum = 1; tnum < nthread; tnum++) { size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread size_t thr_elems = thr_blocks * block_size; // number of elements for this thread size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread @@ -8490,6 +8490,25 @@ static void llama_convert_tensor_internal( out_buff_offs += thr_elems; } + { + size_t thr_blocks = blocks_per_thread + (0 == nthread - 1 ? 
spare_blocks : 0); // num blocks for this thread + size_t thr_elems = thr_blocks * block_size; // number of elements for this thread + size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread + + auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); + } else { + qtype.to_float(inbuf, outbuf, nels); + } + }; + + compute(tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); + + in_buff_offs += thr_block_bytes; + out_buff_offs += thr_elems; + } + hpx::wait_all(futures); return hpx::make_ready_future(); }); @@ -8772,12 +8791,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s #if defined(GGML_USE_HPX) std::vector> futures; - futures.reserve(nthread); + futures.reserve(nthread-1); + hpx::mutex mutex; #else std::vector workers; workers.reserve(nthread); -#endif std::mutex mutex; +#endif int idx = 0; @@ -8875,7 +8895,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s work.resize(nelements * 4); // upper bound on size } new_data = work.data(); - std::array hist_cur = {}; + std::vector hist_cur = {}; static const int chunk_size = 32 * 512; const int nchunk = (nelements + chunk_size - 1)/chunk_size; @@ -8885,6 +8905,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { size_t counter = 0; new_size = 0; + +#if defined(GGML_USE_HPX) + std::vector> thread_local_hist(nthread_use); + std::vector local_sizes(nthread_use, 0); + std::vector counters(nthread_use, counter); + std::generate(counters.begin(), counters.end(), 0, [chunk_size, n = 0]() mutable { return (++n) * chunk_size; }); + + std::function()> computefn = + [&new_size, new_type, f32_data, new_data, nelements](const std::size_t thread, const std::size_t counter) -> hpx::future { + + auto & local_hist = 
thread_local_hist[thread]; + std::size_t & local_size = local_sizes[thread]; + std:size_t first = counter; + + while(true) { + first = counter; + if (first >= nelements) { + if (local_size > 0) { + for (int j=0; j(); + } + + hpx::future this_fut = compute(0, counters[0]); + for (int it = 1; it < nthread_use - 1; ++it) { + futures.push_back(hpx::run_as_hpx_thread(compute, it, counters[it])); + } + hpx::wait_all(futures); + this_fut.wait(); + for(auto & local_hist : thread_local_hist) { + for(auto j = 0; j < int(local_hist.size()); ++j) { + hist_cur[j] += local_hist[j]; + } + } + + new_size = std::reduce(local_sizes.begin(), local_sizes.end(), new_size, std::plus{}); + futures.clear(); +#else auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() { std::array local_hist = {}; size_t local_size = 0; @@ -8905,14 +8970,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); } }; -#if defined(GGML_USE_HPX) - for (int it = 0; it < nthread_use - 1; ++it) { - futures.push_back(hpx::async(compute)); - } - compute(); - hpx::wait_all(futures); - futures.clear(); -#else + for (int it = 0; it < nthread_use - 1; ++it) { workers.emplace_back(compute); } From 2e3d229597d5553850fc7341786160995312ec90 Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: Thu, 28 Dec 2023 12:50:11 -0500 Subject: [PATCH 5/6] fixed syntax issue --- llama.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index 47d79015f..341ec6fc7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,6 +21,7 @@ #endif #ifdef GGML_USE_HPX # include +# include # include # include # include @@ -8895,7 +8896,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s work.resize(nelements * 4); // upper bound on size } new_data = work.data(); - std::vector hist_cur = {}; + 
std::array hist_cur = {}; static const int chunk_size = 32 * 512; const int nchunk = (nelements + chunk_size - 1)/chunk_size; @@ -8910,14 +8911,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector> thread_local_hist(nthread_use); std::vector local_sizes(nthread_use, 0); std::vector counters(nthread_use, counter); - std::generate(counters.begin(), counters.end(), 0, [chunk_size, n = 0]() mutable { return (++n) * chunk_size; }); + std::generate(counters.begin(), counters.end(), [n = 0]() mutable { return (++n) * (32 * 512); }); - std::function()> computefn = - [&new_size, new_type, f32_data, new_data, nelements](const std::size_t thread, const std::size_t counter) -> hpx::future { + std::function(const std::size_t, const std::size_t, std::vector> &, std::vector &)> computefn = + [new_type, f32_data, new_data, nelements](const std::size_t thread, const std::size_t counter, std::vector> & thread_local_hist, std::vector & local_sizes) -> hpx::future { - auto & local_hist = thread_local_hist[thread]; + std::array & local_hist = thread_local_hist[thread]; std::size_t & local_size = local_sizes[thread]; - std:size_t first = counter; + std::size_t first = counter; while(true) { first = counter; @@ -8933,11 +8934,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); } return hpx::make_ready_future(); - } + }; - hpx::future this_fut = compute(0, counters[0]); + hpx::future this_fut = computefn(0, counters[0], thread_local_hist, local_sizes); for (int it = 1; it < nthread_use - 1; ++it) { - futures.push_back(hpx::run_as_hpx_thread(compute, it, counters[it])); + futures.push_back(hpx::run_as_hpx_thread(computefn, it, counters[it], thread_local_hist, local_sizes)); } hpx::wait_all(futures); this_fut.wait(); From c9c4e1f077bcc8f44281eaeca07026a658a6444d Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: 
Thu, 28 Dec 2023 12:53:43 -0500 Subject: [PATCH 6/6] reorder operations --- llama.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 341ec6fc7..e28354e63 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8936,12 +8936,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s return hpx::make_ready_future(); }; - hpx::future this_fut = computefn(0, counters[0], thread_local_hist, local_sizes); for (int it = 1; it < nthread_use - 1; ++it) { futures.push_back(hpx::run_as_hpx_thread(computefn, it, counters[it], thread_local_hist, local_sizes)); } + + hpx::future this_fut = + computefn(0, counters[0], thread_local_hist, local_sizes); + hpx::wait_all(futures); + this_fut.wait(); + for(auto & local_hist : thread_local_hist) { for(auto j = 0; j < int(local_hist.size()); ++j) { hist_cur[j] += local_hist[j];