This commit is contained in:
ct-clmsn 2024-01-13 07:16:00 +11:00 committed by GitHub
commit 51d3f485cd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 270 additions and 1 deletions

View file

@ -97,6 +97,7 @@ option(LLAMA_METAL "llama: use Metal"
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_HPX "llama: use HPX" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@ -357,6 +358,10 @@ if (LLAMA_CUBLAS)
endif()
endif()
if(LLAMA_MPI AND LLAMA_HPX)
    # FATAL is not a recognized message() mode, so the original line printed
    # "FATAL ..." and carried on configuring.  FATAL_ERROR is required to
    # actually abort when both incompatible backends are requested.
    message(FATAL_ERROR "MPI and HPX are not currently compatible together")
endif()
if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
@ -381,6 +386,17 @@ if (LLAMA_MPI)
endif()
endif()
if (LLAMA_HPX)
    cmake_minimum_required(VERSION 3.10)
    find_package(HPX)
    if (HPX_FOUND)
        # Compiled sources select the HPX code paths via this define.
        add_compile_definitions(GGML_USE_HPX)
        # NOTE(review): HPX_CXXFLAGS holds compiler flags, not include
        # directories; appending it to LLAMA_EXTRA_INCLUDES only works if it
        # consists of -I options — confirm against the HPX CMake package.
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${HPX_CXXFLAGS})
    else()
        # FATAL_ERROR (not the unrecognized FATAL keyword) is required for
        # message() to stop configuration when HPX cannot be located.
        message(FATAL_ERROR "HPX not found")
    endif()
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@ -767,7 +783,11 @@ add_library(ggml OBJECT
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
# When HPX is enabled, ggml links against HPX::hpx (HPX supplies its own
# user-land threading); otherwise fall back to the standard Threads package.
if(LLAMA_HPX AND HPX_FOUND)
target_link_libraries(ggml PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
else()
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
endif()
if (GGML_USE_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()
@ -789,6 +809,12 @@ add_library(llama
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
# llama.cpp calls HPX APIs directly (hpx::async, hpx::start, ...), so the
# llama target needs the HPX link dependency and compile flags as well.
if(LLAMA_HPX AND HPX_FOUND)
target_link_libraries(llama PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
target_compile_options (llama PRIVATE ${HPX_CXXFLAGS})
endif()
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}

View file

@ -103,7 +103,11 @@ endif
# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
# The HPX build compiles C++ as C++17 (the HPX headers need a newer standard
# than the project default); all other builds stay on C++11.
ifdef LLAMA_HPX
MK_CXXFLAGS = -std=c++17 -fPIC
else
MK_CXXFLAGS = -std=c++11 -fPIC
endif
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
@ -354,6 +358,46 @@ ifdef LLAMA_MPI
OBJS += ggml-mpi.o
endif # LLAMA_MPI
ifdef LLAMA_HPX
# hwloc is required by the HPX runtime; locate it via pkg-config unless the
# caller already provided HWLOC_FOUND / HWLOC_CXXFLAGS / HWLOC_LDFLAGS.
ifndef HWLOC_FOUND
HWLOC_PKG:=hwloc
HWLOC_REQPKG:=$(shell pkg-config --exists $(HWLOC_PKG) && echo '$(HWLOC_PKG)')
ifneq ($(HWLOC_REQPKG),)
HWLOC_FOUND:=1
HWLOC_CXXFLAGS:=$(shell pkg-config --cflags $(HWLOC_PKG))
HWLOC_LDFLAGS:=$(shell pkg-config --libs $(HWLOC_PKG))
$(warning hwloc found)
else
$(warning 'hwloc' not found)
endif
endif
ifndef HWLOC_FOUND
$(error hwloc not found)
endif
# HPX itself is advertised to pkg-config as the 'hpx_component' package.
# Use $(HPX_PKG) consistently (the original hardcoded the literal name in the
# --cflags/--libs queries, so changing HPX_PKG would silently desynchronize).
ifndef HPX_FOUND
HPX_PKG:=hpx_component
HPX_REQPKG:=$(shell pkg-config --exists $(HPX_PKG) && echo '$(HPX_PKG)')
ifneq ($(HPX_REQPKG),)
HPX_FOUND:=1
HPX_CXXFLAGS:=$(shell pkg-config --cflags $(HPX_PKG))
HPX_LDFLAGS:=$(shell pkg-config --libs $(HPX_PKG))
$(warning HPX found)
else
$(warning 'HPX' not found)
endif
endif
ifndef HPX_FOUND
$(error HPX not found)
endif
MK_CPPFLAGS += -DGGML_USE_HPX $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
MK_CXXFLAGS += -Wno-cast-qual $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
MK_LDFLAGS += -Wno-cast-qual $(HWLOC_LDFLAGS) $(HPX_LDFLAGS)
endif # LLAMA_HPX
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)

View file

@ -342,6 +342,24 @@ Finally, you're ready to run a computation using `mpirun`:
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
### HPX Build
This build depends on the [HPX](https://github.com/STEllAR-GROUP/hpx) asynchronous many-task runtime system. Users are encouraged to compile HPX with tcmalloc. HPX provides user-land (lightweight) threads and work-stealing thread scheduling; both reduce the number of system calls the application makes, which can improve performance. HPX emphasizes the 'futurization' of applications: users are encouraged to express dataflow dependency graphs with futures and HPX's implementation of `std::async`. HPX achieves its best performance on large workloads. The BLIS BLAS library also supports HPX, so the HPX support in this build can additionally improve the performance of the HPX-enabled BLIS backend when used with llama.cpp.
- Using `make`:
- On Linux:
```bash
make LLAMA_HPX=1
```
- Using `CMake` on Linux:
```bash
mkdir build
cd build
CXX=<C++ compiler used to build HPX> cmake -DHPX_DIR=<PATH_TO_HPX_CMAKE> -DLLAMA_HPX=1 ..
make
```
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:

181
llama.cpp
View file

@ -19,6 +19,13 @@
#ifdef GGML_USE_MPI
# include "ggml-mpi.h"
#endif
#ifdef GGML_USE_HPX
# include <cstdlib>
# include <algorithm>
# include <hpx/hpx_start.hpp>
# include <hpx/runtime_local/run_as_hpx_thread.hpp>
# include <hpx/execution.hpp>
#endif
#ifndef QK_K
# ifdef GGML_QKK_64
# define QK_K 64
@ -8328,6 +8335,100 @@ struct quantize_state_internal {
{}
};
#if defined(GGML_USE_HPX)
// Dequantize/convert `tensor` into `output` as f32, splitting the work across
// `nthread` HPX tasks.  `futures` is caller-provided scratch: it receives the
// spawned tasks and is cleared again before returning.
// Throws std::runtime_error for tensor types that cannot be converted.
static void llama_convert_tensor_internal(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<hpx::future<void>> & futures,
    const size_t nelements, const int nthread
) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();

    // Zero-initialize so the copy captured by the lambda below is never an
    // indeterminate value when the tensor is plain F16 (qtype is unused then).
    ggml_type_traits_t qtype = {};
    if (ggml_is_quantized(tensor->type)) {
        qtype = ggml_internal_get_type_traits(tensor->type);
        if (qtype.to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    // Single-threaded fallback: convert everything inline.
    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype.to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ASSERT(false); // unreachable
        }
        return;
    }

    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

    // Hop onto an HPX worker thread so the hpx::async calls below are issued
    // from inside the HPX runtime.
    hpx::future<void> fut =
        hpx::run_as_hpx_thread([&futures, nthread, qtype, block_size, block_size_bytes, blocks_per_thread, spare_blocks, &tensor, &in_buff_offs, &f32_output, &out_buff_offs]() -> hpx::future<void>
        {
            // Single chunk converter shared by the async tasks and the inline
            // tail (the original duplicated this lambda verbatim).
            auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
                if (typ == GGML_TYPE_F16) {
                    ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
                } else {
                    qtype.to_float(inbuf, outbuf, nels);
                }
            };

            // Chunks 1..nthread-1 run as HPX tasks; the last one also absorbs
            // the spare blocks left over by the integer division.
            for (int tnum = 1; tnum < nthread; tnum++) {
                size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this task
                size_t thr_elems = thr_blocks * block_size;           // number of elements for this task
                size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this task
                futures.push_back(hpx::async(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
                in_buff_offs += thr_block_bytes;
                out_buff_offs += thr_elems;
            }

            // The remaining chunk runs inline on this HPX thread.  nthread >= 2
            // here, so this chunk never owns the spare blocks (those went to
            // the last task above) — the original's `0 == nthread - 1` ternary
            // was always false and has been dropped.
            compute(tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, blocks_per_thread * block_size);

            hpx::wait_all(futures);
            return hpx::make_ready_future<void>();
        });
    fut.wait();
    futures.clear();
}
#else
static void llama_convert_tensor_internal(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
@ -8389,6 +8490,8 @@ static void llama_convert_tensor_internal(
workers.clear();
}
#endif
static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
@ -8601,9 +8704,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
size_t total_size_new = 0;
std::vector<int64_t> hist_all(1 << 4, 0);
#if defined(GGML_USE_HPX)
std::vector<hpx::future<void>> futures;
futures.reserve(nthread-1);
hpx::mutex mutex;
#else
std::vector<std::thread> workers;
workers.reserve(nthread);
std::mutex mutex;
#endif
int idx = 0;
@ -8686,7 +8795,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
#if defined(GGML_USE_HPX)
llama_convert_tensor_internal(tensor, f32_conv_buf, futures, nelements, nthread);
#else
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
#endif
f32_data = (float *) f32_conv_buf.data();
}
@ -8707,6 +8820,56 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else {
size_t counter = 0;
new_size = 0;
#if defined(GGML_USE_HPX)
// Per-worker scratch: one histogram and one output-size accumulator per thread.
std::vector<std::array<int64_t, 1 << 4>> thread_local_hist(nthread_use);
std::vector<std::size_t> local_sizes(nthread_use, 0);
// NOTE(review): std::generate immediately overwrites the `counter` seed, and
// its generator starts at (++n) == 1, so counters[0] == 32*512.  The first
// chunk [0, 32*512) is therefore never assigned to any worker.
std::vector<std::size_t> counters(nthread_use, counter);
std::generate(counters.begin(), counters.end(), [n = 0]() mutable { return (++n) * (32 * 512); });
// Worker body: quantize chunks into new_data, recording the histogram and
// byte count in this thread's scratch slots.
std::function<hpx::future<void>(const std::size_t, const std::size_t, std::vector<std::array<int64_t, 1 << 4>> &, std::vector<std::size_t> &)> computefn =
[new_type, f32_data, new_data, nelements](const std::size_t thread, const std::size_t counter, std::vector<std::array<int64_t, 1 << 4>> & thread_local_hist, std::vector<std::size_t> & local_sizes) -> hpx::future<void> {
std::array<int64_t, 1 << 4> & local_hist = thread_local_hist[thread];
std::size_t & local_size = local_sizes[thread];
std::size_t first = counter;
// NOTE(review): `counter` is a const parameter and is never advanced, so
// `first` has the same value on every iteration.  When counter < nelements
// this loop re-quantizes the same chunk forever and never terminates.  The
// #else (std::thread) branch advances a shared counter under a mutex; this
// branch appears to need an equivalent shared/atomic chunk counter.
while(true) {
first = counter;
if (first >= nelements) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
// NOTE(review): this adds local_hist to itself (doubling its own
// counts) instead of folding into a shared histogram; the actual
// merge into hist_cur happens after the waits below, so this loop
// corrupts the per-thread counts.
local_hist[j] += local_hist[j];
}
}
break;
}
size_t last = std::min(nelements, first + chunk_size);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
return hpx::make_ready_future<void>();
};
// NOTE(review): this spawns tasks only for it = 1..nthread_use-2; together
// with the inline call for thread 0 below, index nthread_use-1 (and
// counters[nthread_use-1]) is never used, so one worker's share of chunks is
// unassigned.  Also, hpx::run_as_hpx_thread is documented to block until the
// callable returns — confirm these calls actually run concurrently;
// hpx::async may be intended here.
for (int it = 1; it < nthread_use - 1; ++it) {
futures.push_back(hpx::run_as_hpx_thread(computefn, it, counters[it], thread_local_hist, local_sizes));
}
hpx::future<void> this_fut =
computefn(0, counters[0], thread_local_hist, local_sizes);
hpx::wait_all(futures);
this_fut.wait();
// Merge per-thread histograms and output sizes into the shared totals.
for(auto & local_hist : thread_local_hist) {
for(auto j = 0; j < int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
}
new_size = std::reduce(local_sizes.begin(), local_sizes.end(), new_size, std::plus<std::size_t>{});
futures.clear();
#else
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
std::array<int64_t, 1 << 4> local_hist = {};
size_t local_size = 0;
@ -8727,12 +8890,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
};
for (int it = 0; it < nthread_use - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
#endif
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@ -9160,6 +9325,7 @@ void llama_backend_init(bool numa) {
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
if (numa) {
@ -9169,12 +9335,27 @@ void llama_backend_init(bool numa) {
#ifdef GGML_USE_MPI
ggml_mpi_backend_init();
#endif
#ifdef GGML_USE_HPX
{
// Start the HPX runtime in the background with one worker per hardware
// thread; the HPX code paths in this file then schedule work onto it.
const auto nthread = std::thread::hardware_concurrency();
std::string thread_arg = "--hpx:threads=" + std::to_string(nthread);
hpx::init_params params;
params.cfg = { thread_arg };
// NOTE(review): the return value of hpx::start is ignored — a failed
// runtime start is silently swallowed; consider checking/logging it.
hpx::start(nullptr, 0, nullptr, params);
}
#endif
}
// Tear down the optional runtime backends (MPI, HPX) before process exit.
void llama_backend_free(void) {
#ifdef GGML_USE_MPI
ggml_mpi_backend_free();
#endif
#ifdef GGML_USE_HPX
{
// Shut down the HPX runtime started in llama_backend_init: finalize() is
// posted onto an HPX worker (it presumably must run inside the runtime —
// confirm against HPX docs), then stop() blocks until shutdown completes.
hpx::post([]() { hpx::finalize(); });
hpx::stop();
}
#endif
}
int64_t llama_time_us(void) {