initial import of hpx support

2023-12-23 21:59:34 -05:00 · 2023-12-23 21:59:34 -05:00 · 776f5e29cd
commit 776f5e29cd
parent 799fc22689
3 changed files with 179 additions and 1 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -95,6 +95,7 @@ option(LLAMA_CLBLAST                         "llama: use CLBlast"
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_HPX                             "llama: use HPX"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
@ -320,6 +321,10 @@ if (LLAMA_CUBLAS)
    endif()
 endif()
 if(LLAMA_MPI AND LLAMA_HPX)
    message(FATAL "MPI and HPX are not currently compatible together")
 endif()
 if (LLAMA_MPI)
    cmake_minimum_required(VERSION 3.10)
    find_package(MPI)
@ -344,6 +349,17 @@ if (LLAMA_MPI)
    endif()
 endif()
 if (LLAMA_HPX)
    cmake_minimum_required(VERSION 3.10)
    find_package (HPX)
    if (HPX_FOUND)
        add_compile_definitions(GGML_USE_HPX)
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${HPX_CXXFLAGS})
    else()
        message(FATAL "HPX not found")
    endif()
 endif()
 if (LLAMA_CLBLAST)
    find_package(CLBlast)
    if (CLBlast_FOUND)
@ -727,7 +743,11 @@ add_library(ggml OBJECT
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
-target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+if(LLAMA_HPX AND HPX_FOUND)
   target_link_libraries(ggml PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
 else()
   target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 endif()
 if (GGML_USE_CPU_HBM)
    target_link_libraries(ggml PUBLIC memkind)
 endif()
@ -749,6 +769,12 @@ add_library(llama
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 if(LLAMA_HPX AND HPX_FOUND)
   target_link_libraries(llama PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
   target_compile_options (llama PRIVATE ${HPX_CXXFLAGS})
 endif()
 target_link_libraries(llama PRIVATE
    ggml
    ${LLAMA_EXTRA_LIBS}
--- a/44
+++ b/44
@ -103,7 +103,11 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
 MK_CFLAGS   = -std=c11   -fPIC
 ifdef LLAMA_HPX
 MK_CXXFLAGS = -std=c++17 -fPIC
 else
 MK_CXXFLAGS = -std=c++11 -fPIC
 endif
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@ -345,6 +349,46 @@ ifdef LLAMA_MPI
 	OBJS        += ggml-mpi.o
 endif # LLAMA_MPI
 ifdef LLAMA_HPX
 	ifndef HWLOC_FOUND
                HWLOC_PKG:=hwloc
 		HWLOC_REQPKG:=$(shell pkg-config --exists $(HWLOC_PKG) && echo '$(HWLOC_PKG)')
 		ifneq ($(HWLOC_REQPKG),)
 			HWLOC_FOUND:=1
 			HWLOC_CXXFLAGS:=$(shell pkg-config --cflags $(HWLOC_PKG))
 			HWLOC_LDFLAGS:=$(shell pkg-config --libs $(HWLOC_PKG))
 			warn := $(warning hwloc found)
 		else
 			$(warning 'hwloc' not found)
 		endif
 	endif
 	ifndef HWLOC_FOUND
 		$(error hwloc not found)
 	endif
 	ifndef HPX_FOUND
                HPX_PKG:=hpx_component
 		HPX_REQPKG:=$(shell pkg-config --exists $(HPX_PKG) && echo '$(HPX_PKG)')
 		ifneq ($(HPX_REQPKG),)
 			HPX_FOUND:=1
 			HPX_CXXFLAGS:=$(shell pkg-config --cflags hpx_component)
 			HPX_LDFLAGS:=$(shell pkg-config --libs hpx_component)
 			warn := $(warning HPX found)
 		else
 			$(warning 'HPX' not found)
 		endif
 	endif
 	ifndef HPX_FOUND
 		$(error HPX not found)
 	endif
 	MK_CPPFLAGS += -DGGML_USE_HPX $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
 	MK_CXXFLAGS += -Wno-cast-qual $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
 	MK_LDFLAGS  += -Wno-cast-qual $(HWLOC_LDFLAGS) $(HPX_LDFLAGS)
 endif # LLAMA_HPX
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
--- a/llama.cpp
+++ b/llama.cpp
@ -19,6 +19,12 @@
 #ifdef GGML_USE_MPI
 #  include "ggml-mpi.h"
 #endif
 #ifdef GGML_USE_HPX
 #  include <cstdlib>
 #  include <hpx/hpx_start.hpp>
 #  include <hpx/runtime_local/run_as_hpx_thread.hpp>
 #  include <hpx/execution.hpp>
 #endif
 #ifndef QK_K
 #  ifdef GGML_QKK_64
 #    define QK_K 64
@ -8419,6 +8425,81 @@ struct quantize_state_internal {
        {}
 };
 #if defined(GGML_USE_HPX)
 static void llama_convert_tensor_internal(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<hpx::future<void>> & futures,
    const size_t nelements, const int nthread
 ) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();
    ggml_type_traits_t qtype;
    if (ggml_is_quantized(tensor->type)) {
        qtype = ggml_internal_get_type_traits(tensor->type);
        if (qtype.to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }
    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype.to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ASSERT(false); // unreachable
        }
        return;
    }
    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
    size_t block_size_bytes = ggml_type_size(tensor->type);
    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;
     hpx::future<void> fut =
           hpx::run_as_hpx_thread([&futures, nthread, qtype, block_size, block_size_bytes, blocks_per_thread, spare_blocks, &tensor, &in_buff_offs, &f32_output, &out_buff_offs]() -> hpx::future<void>
           {
               for (int tnum = 0; tnum < nthread; tnum++) {
                   size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
                   size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
                   size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
                   auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
                       if (typ == GGML_TYPE_F16) {
                           ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
                       } else {
                           qtype.to_float(inbuf, outbuf, nels);
                       }
                   };
                   futures.push_back(hpx::async(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
                   in_buff_offs += thr_block_bytes;
                   out_buff_offs += thr_elems;
               }
               hpx::wait_all(futures);
               return hpx::make_ready_future<void>();
         });
    fut.wait();
    futures.clear();
 }
 #else
 static void llama_convert_tensor_internal(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
@ -8480,6 +8561,8 @@ static void llama_convert_tensor_internal(
    workers.clear();
 }
 #endif
 static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    const std::string name = ggml_get_name(tensor);
@ -8687,8 +8770,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    size_t total_size_new = 0;
    std::vector<int64_t> hist_all(1 << 4, 0);
 #if defined(GGML_USE_HPX)
    {
      std::string thread_arg = "--hpx:threads=" + std::to_string(nthread);
      hpx::init_params params;
      params.cfg = { thread_arg };
      hpx::start(nullptr, 0, nullptr, params);
    }
    std::vector<hpx::future<void>> futures;
    futures.reserve(nthread);
 #else
    std::vector<std::thread> workers;
    workers.reserve(nthread);
 #endif
    std::mutex mutex;
    int idx = 0;
@ -8772,7 +8867,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
 #if defined(GGML_USE_HPX)
                llama_convert_tensor_internal(tensor, f32_conv_buf, futures, nelements, nthread);
 #else
                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
 #endif
                f32_data = (float *) f32_conv_buf.data();
            }
@ -8813,12 +8912,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                    }
                };
 #if defined(GGML_USE_HPX)
                for (int it = 0; it < nthread_use - 1; ++it) {
                    futures.push_back(hpx::async(compute));
                }
                compute();
                hpx::wait_all(futures);
                futures.clear();
 #else
                for (int it = 0; it < nthread_use - 1; ++it) {
                    workers.emplace_back(compute);
                }
                compute();
                for (auto & w : workers) { w.join(); }
                workers.clear();
 #endif
            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);