This commit is contained in:
ct-clmsn 2024-01-13 07:16:00 +11:00 committed by GitHub
commit 51d3f485cd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 270 additions and 1 deletions

View file

@ -97,6 +97,7 @@ option(LLAMA_METAL "llama: use Metal"
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_HPX "llama: use HPX" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@ -357,6 +358,10 @@ if (LLAMA_CUBLAS)
endif()
endif()
if(LLAMA_MPI AND LLAMA_HPX)
    # FATAL is not a recognized message() mode, so the original line printed
    # "FATAL ..." and carried on configuring.  FATAL_ERROR is required to
    # actually abort when both incompatible backends are requested.
    message(FATAL_ERROR "MPI and HPX are not currently compatible together")
endif()
if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
@ -381,6 +386,17 @@ if (LLAMA_MPI)
endif()
endif()
if (LLAMA_HPX)
    cmake_minimum_required(VERSION 3.10)
    find_package(HPX)
    if (HPX_FOUND)
        # Compiled sources select the HPX code paths via this define.
        add_compile_definitions(GGML_USE_HPX)
        # NOTE(review): HPX_CXXFLAGS holds compiler flags, not include
        # directories; appending it to LLAMA_EXTRA_INCLUDES only works if it
        # consists of -I options — confirm against the HPX CMake package.
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${HPX_CXXFLAGS})
    else()
        # FATAL_ERROR (not the unrecognized FATAL keyword) is required for
        # message() to stop configuration when HPX cannot be located.
        message(FATAL_ERROR "HPX not found")
    endif()
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@ -767,7 +783,11 @@ add_library(ggml OBJECT
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
# When HPX is enabled, ggml links against HPX::hpx (HPX supplies its own
# user-land threading); otherwise fall back to the standard Threads package.
if(LLAMA_HPX AND HPX_FOUND)
target_link_libraries(ggml PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
else()
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
endif()
if (GGML_USE_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()
@ -789,6 +809,12 @@ add_library(llama
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
# llama.cpp calls HPX APIs directly (hpx::async, hpx::start, ...), so the
# llama target needs the HPX link dependency and compile flags as well.
if(LLAMA_HPX AND HPX_FOUND)
target_link_libraries(llama PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
target_compile_options (llama PRIVATE ${HPX_CXXFLAGS})
endif()
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}

View file

@ -103,7 +103,11 @@ endif
# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
# The HPX build compiles C++ as C++17 (the HPX headers need a newer standard
# than the project default); all other builds stay on C++11.
ifdef LLAMA_HPX
MK_CXXFLAGS = -std=c++17 -fPIC
else
MK_CXXFLAGS = -std=c++11 -fPIC
endif
# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
@ -354,6 +358,46 @@ ifdef LLAMA_MPI
OBJS += ggml-mpi.o
endif # LLAMA_MPI
ifdef LLAMA_HPX
# hwloc is required by the HPX runtime; locate it via pkg-config unless the
# caller already provided HWLOC_FOUND / HWLOC_CXXFLAGS / HWLOC_LDFLAGS.
ifndef HWLOC_FOUND
HWLOC_PKG:=hwloc
HWLOC_REQPKG:=$(shell pkg-config --exists $(HWLOC_PKG) && echo '$(HWLOC_PKG)')
ifneq ($(HWLOC_REQPKG),)
HWLOC_FOUND:=1
HWLOC_CXXFLAGS:=$(shell pkg-config --cflags $(HWLOC_PKG))
HWLOC_LDFLAGS:=$(shell pkg-config --libs $(HWLOC_PKG))
$(warning hwloc found)
else
$(warning 'hwloc' not found)
endif
endif
ifndef HWLOC_FOUND
$(error hwloc not found)
endif
# HPX itself is advertised to pkg-config as the 'hpx_component' package.
# Use $(HPX_PKG) consistently (the original hardcoded the literal name in the
# --cflags/--libs queries, so changing HPX_PKG would silently desynchronize).
ifndef HPX_FOUND
HPX_PKG:=hpx_component
HPX_REQPKG:=$(shell pkg-config --exists $(HPX_PKG) && echo '$(HPX_PKG)')
ifneq ($(HPX_REQPKG),)
HPX_FOUND:=1
HPX_CXXFLAGS:=$(shell pkg-config --cflags $(HPX_PKG))
HPX_LDFLAGS:=$(shell pkg-config --libs $(HPX_PKG))
$(warning HPX found)
else
$(warning 'HPX' not found)
endif
endif
ifndef HPX_FOUND
$(error HPX not found)
endif
MK_CPPFLAGS += -DGGML_USE_HPX $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
MK_CXXFLAGS += -Wno-cast-qual $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
MK_LDFLAGS += -Wno-cast-qual $(HWLOC_LDFLAGS) $(HPX_LDFLAGS)
endif # LLAMA_HPX
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)

View file

@ -342,6 +342,24 @@ Finally, you're ready to run a computation using `mpirun`:
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
### HPX Build
This build depends on the [HPX](https://github.com/STEllAR-GROUP/hpx) asynchronous many-task runtime system. Users are encouraged to compile HPX with tcmalloc. HPX provides user-land (lightweight) threads and work-stealing thread scheduling; both reduce the number of system calls the application makes, which can improve performance. HPX emphasizes the 'futurization' of applications: users are encouraged to express dataflow dependency graphs with futures and HPX's implementation of `std::async`. HPX achieves its best performance on large workloads. The BLIS BLAS library also supports HPX, so the HPX support in this build can additionally improve the performance of the HPX-enabled BLIS backend when used with llama.cpp.
- Using `make`:
- On Linux:
```bash
make LLAMA_HPX=1
```
- Using `CMake` on Linux:
```bash
mkdir build
cd build
CXX=<C++ compiler used to build HPX> cmake -DHPX_DIR=<PATH_TO_HPX_CMAKE> -DLLAMA_HPX=1 ..
make
```
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:

181
llama.cpp
View file

@ -19,6 +19,13 @@
#ifdef GGML_USE_MPI
# include "ggml-mpi.h"
#endif
#ifdef GGML_USE_HPX
# include <cstdlib>
# include <algorithm>
# include <hpx/hpx_start.hpp>
# include <hpx/runtime_local/run_as_hpx_thread.hpp>
# include <hpx/execution.hpp>
#endif
#ifndef QK_K
# ifdef GGML_QKK_64
# define QK_K 64
@ -8328,6 +8335,100 @@ struct quantize_state_internal {
{}
};
#if defined(GGML_USE_HPX)
// Dequantize/convert `tensor` into `output` as f32, splitting the work across
// `nthread` HPX tasks.  `futures` is caller-provided scratch: it receives the
// spawned tasks and is cleared again before returning.
// Throws std::runtime_error for tensor types that cannot be converted.
static void llama_convert_tensor_internal(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<hpx::future<void>> & futures,
    const size_t nelements, const int nthread
) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();

    // Zero-initialize so the copy captured by the lambda below is never an
    // indeterminate value when the tensor is plain F16 (qtype is unused then).
    ggml_type_traits_t qtype = {};
    if (ggml_is_quantized(tensor->type)) {
        qtype = ggml_internal_get_type_traits(tensor->type);
        if (qtype.to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    // Single-threaded fallback: convert everything inline.
    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype.to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ASSERT(false); // unreachable
        }
        return;
    }

    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

    // Hop onto an HPX worker thread so the hpx::async calls below are issued
    // from inside the HPX runtime.
    hpx::future<void> fut =
        hpx::run_as_hpx_thread([&futures, nthread, qtype, block_size, block_size_bytes, blocks_per_thread, spare_blocks, &tensor, &in_buff_offs, &f32_output, &out_buff_offs]() -> hpx::future<void>
        {
            // Single chunk converter shared by the async tasks and the inline
            // tail (the original duplicated this lambda verbatim).
            auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
                if (typ == GGML_TYPE_F16) {
                    ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
                } else {
                    qtype.to_float(inbuf, outbuf, nels);
                }
            };

            // Chunks 1..nthread-1 run as HPX tasks; the last one also absorbs
            // the spare blocks left over by the integer division.
            for (int tnum = 1; tnum < nthread; tnum++) {
                size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this task
                size_t thr_elems = thr_blocks * block_size;           // number of elements for this task
                size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this task
                futures.push_back(hpx::async(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
                in_buff_offs += thr_block_bytes;
                out_buff_offs += thr_elems;
            }

            // The remaining chunk runs inline on this HPX thread.  nthread >= 2
            // here, so this chunk never owns the spare blocks (those went to
            // the last task above) — the original's `0 == nthread - 1` ternary
            // was always false and has been dropped.
            compute(tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, blocks_per_thread * block_size);

            hpx::wait_all(futures);
            return hpx::make_ready_future<void>();
        });
    fut.wait();
    futures.clear();
}
#else
static void llama_convert_tensor_internal(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
@ -8389,6 +8490,8 @@ static void llama_convert_tensor_internal(
workers.clear();
}
#endif
static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
@ -8601,9 +8704,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
size_t total_size_new = 0;
std::vector<int64_t> hist_all(1 << 4, 0);
#if defined(GGML_USE_HPX)
std::vector<hpx::future<void>> futures;
futures.reserve(nthread-1);
hpx::mutex mutex;
#else
std::vector<std::thread> workers;
workers.reserve(nthread);
std::mutex mutex;
#endif
int idx = 0;
@ -8686,7 +8795,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
#if defined(GGML_USE_HPX)
llama_convert_tensor_internal(tensor, f32_conv_buf, futures, nelements, nthread);
#else
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
#endif
f32_data = (float *) f32_conv_buf.data();
}
@ -8707,6 +8820,56 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else {
size_t counter = 0;
new_size = 0;
#if defined(GGML_USE_HPX)
// Per-worker scratch: one histogram and one output-size accumulator per thread.
std::vector<std::array<int64_t, 1 << 4>> thread_local_hist(nthread_use);
std::vector<std::size_t> local_sizes(nthread_use, 0);
// NOTE(review): std::generate immediately overwrites the `counter` seed, and
// its generator starts at (++n) == 1, so counters[0] == 32*512.  The first
// chunk [0, 32*512) is therefore never assigned to any worker.
std::vector<std::size_t> counters(nthread_use, counter);
std::generate(counters.begin(), counters.end(), [n = 0]() mutable { return (++n) * (32 * 512); });
// Worker body: quantize chunks into new_data, recording the histogram and
// byte count in this thread's scratch slots.
std::function<hpx::future<void>(const std::size_t, const std::size_t, std::vector<std::array<int64_t, 1 << 4>> &, std::vector<std::size_t> &)> computefn =
[new_type, f32_data, new_data, nelements](const std::size_t thread, const std::size_t counter, std::vector<std::array<int64_t, 1 << 4>> & thread_local_hist, std::vector<std::size_t> & local_sizes) -> hpx::future<void> {
std::array<int64_t, 1 << 4> & local_hist = thread_local_hist[thread];
std::size_t & local_size = local_sizes[thread];
std::size_t first = counter;
// NOTE(review): `counter` is a const parameter and is never advanced, so
// `first` has the same value on every iteration.  When counter < nelements
// this loop re-quantizes the same chunk forever and never terminates.  The
// #else (std::thread) branch advances a shared counter under a mutex; this
// branch appears to need an equivalent shared/atomic chunk counter.
while(true) {
first = counter;
if (first >= nelements) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
// NOTE(review): this adds local_hist to itself (doubling its own
// counts) instead of folding into a shared histogram; the actual
// merge into hist_cur happens after the waits below, so this loop
// corrupts the per-thread counts.
local_hist[j] += local_hist[j];
}
}
break;
}
size_t last = std::min(nelements, first + chunk_size);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
return hpx::make_ready_future<void>();
};
// NOTE(review): this spawns tasks only for it = 1..nthread_use-2; together
// with the inline call for thread 0 below, index nthread_use-1 (and
// counters[nthread_use-1]) is never used, so one worker's share of chunks is
// unassigned.  Also, hpx::run_as_hpx_thread is documented to block until the
// callable returns — confirm these calls actually run concurrently;
// hpx::async may be intended here.
for (int it = 1; it < nthread_use - 1; ++it) {
futures.push_back(hpx::run_as_hpx_thread(computefn, it, counters[it], thread_local_hist, local_sizes));
}
hpx::future<void> this_fut =
computefn(0, counters[0], thread_local_hist, local_sizes);
hpx::wait_all(futures);
this_fut.wait();
// Merge per-thread histograms and output sizes into the shared totals.
for(auto & local_hist : thread_local_hist) {
for(auto j = 0; j < int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
}
new_size = std::reduce(local_sizes.begin(), local_sizes.end(), new_size, std::plus<std::size_t>{});
futures.clear();
#else
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
std::array<int64_t, 1 << 4> local_hist = {};
size_t local_size = 0;
@ -8727,12 +8890,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
};
for (int it = 0; it < nthread_use - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
#endif
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@ -9160,6 +9325,7 @@ void llama_backend_init(bool numa) {
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
if (numa) {
@ -9169,12 +9335,27 @@ void llama_backend_init(bool numa) {
#ifdef GGML_USE_MPI
ggml_mpi_backend_init();
#endif
#ifdef GGML_USE_HPX
{
// Start the HPX runtime in the background with one worker per hardware
// thread; the HPX code paths in this file then schedule work onto it.
const auto nthread = std::thread::hardware_concurrency();
std::string thread_arg = "--hpx:threads=" + std::to_string(nthread);
hpx::init_params params;
params.cfg = { thread_arg };
// NOTE(review): the return value of hpx::start is ignored — a failed
// runtime start is silently swallowed; consider checking/logging it.
hpx::start(nullptr, 0, nullptr, params);
}
#endif
}
// Tear down the optional runtime backends (MPI, HPX) before process exit.
void llama_backend_free(void) {
#ifdef GGML_USE_MPI
ggml_mpi_backend_free();
#endif
#ifdef GGML_USE_HPX
{
// Shut down the HPX runtime started in llama_backend_init: finalize() is
// posted onto an HPX worker (it presumably must run inside the runtime —
// confirm against HPX docs), then stop() blocks until shutdown completes.
hpx::post([]() { hpx::finalize(); });
hpx::stop();
}
#endif
}
int64_t llama_time_us(void) {