From 776f5e29cd7eeec09001d5b9672069fd4a485bc6 Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: Sat, 23 Dec 2023 21:59:34 -0500 Subject: [PATCH] initial import of hpx support --- CMakeLists.txt | 28 ++++++++++++- Makefile | 44 ++++++++++++++++++++ llama.cpp | 108 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e3cd43ab3..9e3ce2080 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,7 @@ option(LLAMA_CLBLAST "llama: use CLBlast" option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_MPI "llama: use MPI" OFF) +option(LLAMA_HPX "llama: use HPX" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) @@ -320,6 +321,10 @@ if (LLAMA_CUBLAS) endif() endif() +if(LLAMA_MPI AND LLAMA_HPX) + message(FATAL_ERROR "MPI and HPX are not currently compatible together") +endif() + if (LLAMA_MPI) cmake_minimum_required(VERSION 3.10) find_package(MPI) @@ -344,6 +349,17 @@ if (LLAMA_MPI) endif() endif() +if (LLAMA_HPX) + cmake_minimum_required(VERSION 3.10) + find_package (HPX) + if (HPX_FOUND) + add_compile_definitions(GGML_USE_HPX) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${HPX_CXXFLAGS}) + else() + message(FATAL_ERROR "HPX not found") + endif() +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) @@ -727,7 +743,11 @@ add_library(ggml OBJECT target_include_directories(ggml PUBLIC . 
${LLAMA_EXTRA_INCLUDES}) target_compile_features(ggml PUBLIC c_std_11) # don't bump -target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) +if(LLAMA_HPX AND HPX_FOUND) + target_link_libraries(ggml PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS}) +else() + target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) +endif() if (GGML_USE_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() @@ -749,6 +769,12 @@ add_library(llama target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump + +if(LLAMA_HPX AND HPX_FOUND) + target_link_libraries(llama PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS}) + target_compile_options (llama PRIVATE ${HPX_CXXFLAGS}) +endif() + target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} diff --git a/Makefile b/Makefile index 8273f8400..15f236d25 100644 --- a/Makefile +++ b/Makefile @@ -103,7 +103,11 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -I. -Icommon MK_CFLAGS = -std=c11 -fPIC +ifdef LLAMA_HPX +MK_CXXFLAGS = -std=c++17 -fPIC +else MK_CXXFLAGS = -std=c++11 -fPIC +endif # -Ofast tends to produce faster code, but may not be available for some compilers. 
ifdef LLAMA_FAST @@ -345,6 +349,46 @@ ifdef LLAMA_MPI OBJS += ggml-mpi.o endif # LLAMA_MPI +ifdef LLAMA_HPX + ifndef HWLOC_FOUND + HWLOC_PKG:=hwloc + HWLOC_REQPKG:=$(shell pkg-config --exists $(HWLOC_PKG) && echo '$(HWLOC_PKG)') + ifneq ($(HWLOC_REQPKG),) + HWLOC_FOUND:=1 + HWLOC_CXXFLAGS:=$(shell pkg-config --cflags $(HWLOC_PKG)) + HWLOC_LDFLAGS:=$(shell pkg-config --libs $(HWLOC_PKG)) + warn := $(warning hwloc found) + else + $(warning 'hwloc' not found) + endif + endif + + ifndef HWLOC_FOUND + $(error hwloc not found) + endif + + ifndef HPX_FOUND + HPX_PKG:=hpx_component + HPX_REQPKG:=$(shell pkg-config --exists $(HPX_PKG) && echo '$(HPX_PKG)') + ifneq ($(HPX_REQPKG),) + HPX_FOUND:=1 + HPX_CXXFLAGS:=$(shell pkg-config --cflags hpx_component) + HPX_LDFLAGS:=$(shell pkg-config --libs hpx_component) + warn := $(warning HPX found) + else + $(warning 'HPX' not found) + endif + endif + + ifndef HPX_FOUND + $(error HPX not found) + endif + + MK_CPPFLAGS += -DGGML_USE_HPX $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS) + MK_CXXFLAGS += -Wno-cast-qual $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS) + MK_LDFLAGS += -Wno-cast-qual $(HWLOC_LDFLAGS) $(HPX_LDFLAGS) + endif # LLAMA_HPX + ifdef LLAMA_OPENBLAS MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) diff --git a/llama.cpp b/llama.cpp index edd2910b3..8ad116b8f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,12 @@ #ifdef GGML_USE_MPI # include "ggml-mpi.h" #endif +#ifdef GGML_USE_HPX +# include <hpx/hpx_start.hpp> +# include <hpx/include/async.hpp> +# include <hpx/include/lcos.hpp> +# include <hpx/include/run_as.hpp> +#endif #ifndef QK_K # ifdef GGML_QKK_64 # define QK_K 64 # endif @@ -8419,6 +8425,81 @@ struct quantize_state_internal { {} }; +#if defined(GGML_USE_HPX) + +static void llama_convert_tensor_internal( + struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<hpx::future<void>> & futures, + const size_t nelements, const int nthread +) { + if (output.size() < nelements) { + output.resize(nelements); + } + float * f32_output = (float *) output.data(); 
+ + ggml_type_traits_t qtype; + if (ggml_is_quantized(tensor->type)) { + qtype = ggml_internal_get_type_traits(tensor->type); + if (qtype.to_float == NULL) { + throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); + } + } else if (tensor->type != GGML_TYPE_F16) { + throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); + } + + if (nthread < 2) { + if (tensor->type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); + } else if (ggml_is_quantized(tensor->type)) { + qtype.to_float(tensor->data, f32_output, nelements); + } else { + GGML_ASSERT(false); // unreachable + } + return; + } + + size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type); + size_t block_size_bytes = ggml_type_size(tensor->type); + + GGML_ASSERT(nelements % block_size == 0); + size_t nblocks = nelements / block_size; + size_t blocks_per_thread = nblocks / nthread; + size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count + + size_t in_buff_offs = 0; + size_t out_buff_offs = 0; + + hpx::future<void> fut = + hpx::run_as_hpx_thread([&futures, nthread, qtype, block_size, block_size_bytes, blocks_per_thread, spare_blocks, &tensor, &in_buff_offs, &f32_output, &out_buff_offs]() -> hpx::future<void> + { + for (int tnum = 0; tnum < nthread; tnum++) { + size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? 
spare_blocks : 0); // num blocks for this thread + size_t thr_elems = thr_blocks * block_size; // number of elements for this thread + size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread + + auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); + } else { + qtype.to_float(inbuf, outbuf, nels); + } + }; + + futures.push_back(hpx::async(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); + + in_buff_offs += thr_block_bytes; + out_buff_offs += thr_elems; + } + + hpx::wait_all(futures); + return hpx::make_ready_future(); + }); + + fut.wait(); + futures.clear(); +} + +#else + static void llama_convert_tensor_internal( struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers, const size_t nelements, const int nthread @@ -8480,6 +8561,8 @@ static void llama_convert_tensor_internal( workers.clear(); } +#endif + static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { const std::string name = ggml_get_name(tensor); @@ -8687,8 +8770,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s size_t total_size_new = 0; std::vector<int64_t> hist_all(1 << 4, 0); +#if defined(GGML_USE_HPX) + { + std::string thread_arg = "--hpx:threads=" + std::to_string(nthread); + hpx::init_params params; + params.cfg = { thread_arg }; + hpx::start(nullptr, 0, nullptr, params); + } + + std::vector<hpx::future<void>> futures; + futures.reserve(nthread); +#else std::vector<std::thread> workers; workers.reserve(nthread); +#endif std::mutex mutex; int idx = 0; @@ -8772,7 +8867,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { throw std::runtime_error(format("requantizing from type %s is 
disabled", ggml_type_name(tensor->type))); } else { +#if defined(GGML_USE_HPX) + llama_convert_tensor_internal(tensor, f32_conv_buf, futures, nelements, nthread); +#else llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread); +#endif f32_data = (float *) f32_conv_buf.data(); } @@ -8813,12 +8912,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); } }; +#if defined(GGML_USE_HPX) + for (int it = 0; it < nthread_use - 1; ++it) { + futures.push_back(hpx::async(compute)); + } + compute(); + hpx::wait_all(futures); + futures.clear(); +#else for (int it = 0; it < nthread_use - 1; ++it) { workers.emplace_back(compute); } compute(); for (auto & w : workers) { w.join(); } workers.clear(); +#endif } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);