Merge c9c4e1f077
into de473f5f8e
This commit is contained in:
commit
51d3f485cd
4 changed files with 270 additions and 1 deletions
|
@ -97,6 +97,7 @@ option(LLAMA_METAL "llama: use Metal"
|
|||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_HPX "llama: use HPX" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
|
@ -357,6 +358,10 @@ if (LLAMA_CUBLAS)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if(LLAMA_MPI AND LLAMA_HPX)
|
||||
# FATAL_ERROR is the message() mode that stops configuration; a bare FATAL is
# not a mode keyword, so it would only be printed and the build would continue.
message(FATAL_ERROR "MPI and HPX are not currently compatible together")
|
||||
endif()
|
||||
|
||||
if (LLAMA_MPI)
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
find_package(MPI)
|
||||
|
@ -381,6 +386,17 @@ if (LLAMA_MPI)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_HPX)
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
find_package (HPX)
|
||||
if (HPX_FOUND)
|
||||
add_compile_definitions(GGML_USE_HPX)
|
||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${HPX_CXXFLAGS})
|
||||
else()
|
||||
# LLAMA_HPX was requested explicitly, so a missing HPX must abort configuration.
# FATAL_ERROR is the mode keyword that does this; a bare FATAL is just printed.
message(FATAL_ERROR "HPX not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
|
@ -767,7 +783,11 @@ add_library(ggml OBJECT
|
|||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
if(LLAMA_HPX AND HPX_FOUND)
|
||||
target_link_libraries(ggml PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
|
||||
else()
|
||||
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
endif()
|
||||
if (GGML_USE_CPU_HBM)
|
||||
target_link_libraries(ggml PUBLIC memkind)
|
||||
endif()
|
||||
|
@ -789,6 +809,12 @@ add_library(llama
|
|||
|
||||
target_include_directories(llama PUBLIC .)
|
||||
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
|
||||
|
||||
if(LLAMA_HPX AND HPX_FOUND)
|
||||
target_link_libraries(llama PUBLIC HPX::hpx ${LLAMA_EXTRA_LIBS})
|
||||
target_compile_options (llama PRIVATE ${HPX_CXXFLAGS})
|
||||
endif()
|
||||
|
||||
target_link_libraries(llama PRIVATE
|
||||
ggml
|
||||
${LLAMA_EXTRA_LIBS}
|
||||
|
|
44
Makefile
44
Makefile
|
@ -103,7 +103,11 @@ endif
|
|||
# keep standard at C11 and C++11
|
||||
MK_CPPFLAGS = -I. -Icommon
|
||||
MK_CFLAGS = -std=c11 -fPIC
|
||||
ifdef LLAMA_HPX
|
||||
MK_CXXFLAGS = -std=c++17 -fPIC
|
||||
else
|
||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
endif
|
||||
|
||||
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
||||
ifdef LLAMA_FAST
|
||||
|
@ -354,6 +358,46 @@ ifdef LLAMA_MPI
|
|||
OBJS += ggml-mpi.o
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_HPX
|
||||
ifndef HWLOC_FOUND
|
||||
HWLOC_PKG:=hwloc
|
||||
HWLOC_REQPKG:=$(shell pkg-config --exists $(HWLOC_PKG) && echo '$(HWLOC_PKG)')
|
||||
ifneq ($(HWLOC_REQPKG),)
|
||||
HWLOC_FOUND:=1
|
||||
HWLOC_CXXFLAGS:=$(shell pkg-config --cflags $(HWLOC_PKG))
|
||||
HWLOC_LDFLAGS:=$(shell pkg-config --libs $(HWLOC_PKG))
|
||||
warn := $(warning hwloc found)
|
||||
else
|
||||
$(warning 'hwloc' not found)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef HWLOC_FOUND
|
||||
$(error hwloc not found)
|
||||
endif
|
||||
|
||||
ifndef HPX_FOUND
|
||||
HPX_PKG:=hpx_component
|
||||
HPX_REQPKG:=$(shell pkg-config --exists $(HPX_PKG) && echo '$(HPX_PKG)')
|
||||
ifneq ($(HPX_REQPKG),)
|
||||
HPX_FOUND:=1
|
||||
HPX_CXXFLAGS:=$(shell pkg-config --cflags hpx_component)
|
||||
HPX_LDFLAGS:=$(shell pkg-config --libs hpx_component)
|
||||
warn := $(warning HPX found)
|
||||
else
|
||||
$(warning 'HPX' not found)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef HPX_FOUND
|
||||
$(error HPX not found)
|
||||
endif
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_HPX $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
|
||||
MK_CXXFLAGS += -Wno-cast-qual $(HWLOC_CXXFLAGS) $(HPX_CXXFLAGS)
|
||||
# Linker flags only: -Wno-cast-qual is a compile-time diagnostic option (it is
# already added to MK_CXXFLAGS) and has no meaning at link time.
MK_LDFLAGS += $(HWLOC_LDFLAGS) $(HPX_LDFLAGS)
|
||||
endif # LLAMA_HPX
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
||||
|
|
18
README.md
18
README.md
|
@ -342,6 +342,24 @@ Finally, you're ready to run a computation using `mpirun`:
|
|||
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||
```
|
||||
|
||||
### HPX Build
|
||||
|
||||
This build depends on [HPX](https://github.com/STEllAR-GROUP/hpx), an asynchronous many-task runtime system. Users are encouraged to compile HPX with tcmalloc. HPX provides a user-land thread implementation and a work-stealing thread management implementation. Both features reduce the number of system calls the application has to make, which can improve performance. HPX emphasizes 'futurization' of applications; users are encouraged to craft dataflow dependency graphs using futures and HPX's implementation of `std::async`. HPX achieves its best performance on large workloads. The BLIS BLAS library has support for HPX. The HPX support provided by this build will improve the performance of the BLIS HPX backend when applied to llama.cpp.
|
||||
|
||||
- Using `make`:
|
||||
- On Linux:
|
||||
```bash
|
||||
make LLAMA_HPX=1
|
||||
```
|
||||
|
||||
- Using `CMake` on Linux:
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
CXX=<C++ compiler used to build HPX> cmake -DHPX_DIR=<PATH_TO_HPX_CMAKE> -DLLAMA_HPX=1 ..
|
||||
make
|
||||
```
|
||||
|
||||
### BLAS Build
|
||||
|
||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
|
||||
|
|
181
llama.cpp
181
llama.cpp
|
@ -19,6 +19,13 @@
|
|||
#ifdef GGML_USE_MPI
|
||||
# include "ggml-mpi.h"
|
||||
#endif
|
||||
#ifdef GGML_USE_HPX
|
||||
# include <cstdlib>
|
||||
# include <algorithm>
|
||||
# include <hpx/hpx_start.hpp>
|
||||
# include <hpx/runtime_local/run_as_hpx_thread.hpp>
|
||||
# include <hpx/execution.hpp>
|
||||
#endif
|
||||
#ifndef QK_K
|
||||
# ifdef GGML_QKK_64
|
||||
# define QK_K 64
|
||||
|
@ -8328,6 +8335,100 @@ struct quantize_state_internal {
|
|||
{}
|
||||
};
|
||||
|
||||
#if defined(GGML_USE_HPX)
|
||||
|
||||
static void llama_convert_tensor_internal(
|
||||
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<hpx::future<void>> & futures,
|
||||
const size_t nelements, const int nthread
|
||||
) {
|
||||
if (output.size() < nelements) {
|
||||
output.resize(nelements);
|
||||
}
|
||||
float * f32_output = (float *) output.data();
|
||||
|
||||
ggml_type_traits_t qtype;
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
qtype = ggml_internal_get_type_traits(tensor->type);
|
||||
if (qtype.to_float == NULL) {
|
||||
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
||||
}
|
||||
} else if (tensor->type != GGML_TYPE_F16) {
|
||||
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
||||
}
|
||||
|
||||
if (nthread < 2) {
|
||||
if (tensor->type == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
||||
} else if (ggml_is_quantized(tensor->type)) {
|
||||
qtype.to_float(tensor->data, f32_output, nelements);
|
||||
} else {
|
||||
GGML_ASSERT(false); // unreachable
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
|
||||
size_t block_size_bytes = ggml_type_size(tensor->type);
|
||||
|
||||
GGML_ASSERT(nelements % block_size == 0);
|
||||
size_t nblocks = nelements / block_size;
|
||||
size_t blocks_per_thread = nblocks / nthread;
|
||||
size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
|
||||
|
||||
size_t in_buff_offs = 0;
|
||||
size_t out_buff_offs = 0;
|
||||
|
||||
hpx::future<void> fut =
|
||||
hpx::run_as_hpx_thread([&futures, nthread, qtype, block_size, block_size_bytes, blocks_per_thread, spare_blocks, &tensor, &in_buff_offs, &f32_output, &out_buff_offs]() -> hpx::future<void>
|
||||
{
|
||||
for (int tnum = 1; tnum < nthread; tnum++) {
|
||||
size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
|
||||
size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
|
||||
size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
|
||||
|
||||
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
||||
if (typ == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
||||
} else {
|
||||
qtype.to_float(inbuf, outbuf, nels);
|
||||
}
|
||||
};
|
||||
|
||||
futures.push_back(hpx::async(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
|
||||
|
||||
in_buff_offs += thr_block_bytes;
|
||||
out_buff_offs += thr_elems;
|
||||
}
|
||||
|
||||
{
|
||||
size_t thr_blocks = blocks_per_thread + (0 == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
|
||||
size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
|
||||
size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
|
||||
|
||||
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
||||
if (typ == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
||||
} else {
|
||||
qtype.to_float(inbuf, outbuf, nels);
|
||||
}
|
||||
};
|
||||
|
||||
compute(tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
|
||||
|
||||
in_buff_offs += thr_block_bytes;
|
||||
out_buff_offs += thr_elems;
|
||||
}
|
||||
|
||||
hpx::wait_all(futures);
|
||||
return hpx::make_ready_future<void>();
|
||||
});
|
||||
|
||||
fut.wait();
|
||||
futures.clear();
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void llama_convert_tensor_internal(
|
||||
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
||||
const size_t nelements, const int nthread
|
||||
|
@ -8389,6 +8490,8 @@ static void llama_convert_tensor_internal(
|
|||
workers.clear();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
|
@ -8601,9 +8704,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
size_t total_size_new = 0;
|
||||
std::vector<int64_t> hist_all(1 << 4, 0);
|
||||
|
||||
#if defined(GGML_USE_HPX)
|
||||
std::vector<hpx::future<void>> futures;
|
||||
futures.reserve(nthread-1);
|
||||
hpx::mutex mutex;
|
||||
#else
|
||||
std::vector<std::thread> workers;
|
||||
workers.reserve(nthread);
|
||||
std::mutex mutex;
|
||||
#endif
|
||||
|
||||
int idx = 0;
|
||||
|
||||
|
@ -8686,7 +8795,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
} else {
|
||||
#if defined(GGML_USE_HPX)
|
||||
llama_convert_tensor_internal(tensor, f32_conv_buf, futures, nelements, nthread);
|
||||
#else
|
||||
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
#endif
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
|
@ -8707,6 +8820,56 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
} else {
|
||||
size_t counter = 0;
|
||||
new_size = 0;
|
||||
|
||||
#if defined(GGML_USE_HPX)
|
||||
std::vector<std::array<int64_t, 1 << 4>> thread_local_hist(nthread_use);
|
||||
std::vector<std::size_t> local_sizes(nthread_use, 0);
|
||||
std::vector<std::size_t> counters(nthread_use, counter);
|
||||
std::generate(counters.begin(), counters.end(), [n = 0]() mutable { return (++n) * (32 * 512); });
|
||||
|
||||
std::function<hpx::future<void>(const std::size_t, const std::size_t, std::vector<std::array<int64_t, 1 << 4>> &, std::vector<std::size_t> &)> computefn =
|
||||
[new_type, f32_data, new_data, nelements](const std::size_t thread, const std::size_t counter, std::vector<std::array<int64_t, 1 << 4>> & thread_local_hist, std::vector<std::size_t> & local_sizes) -> hpx::future<void> {
|
||||
|
||||
std::array<int64_t, 1 << 4> & local_hist = thread_local_hist[thread];
|
||||
std::size_t & local_size = local_sizes[thread];
|
||||
std::size_t first = counter;
|
||||
|
||||
while(true) {
|
||||
first = counter;
|
||||
if (first >= nelements) {
|
||||
if (local_size > 0) {
|
||||
for (int j=0; j<int(local_hist.size()); ++j) {
|
||||
local_hist[j] += local_hist[j];
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
size_t last = std::min(nelements, first + chunk_size);
|
||||
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
||||
}
|
||||
return hpx::make_ready_future<void>();
|
||||
};
|
||||
|
||||
for (int it = 1; it < nthread_use - 1; ++it) {
|
||||
futures.push_back(hpx::run_as_hpx_thread(computefn, it, counters[it], thread_local_hist, local_sizes));
|
||||
}
|
||||
|
||||
hpx::future<void> this_fut =
|
||||
computefn(0, counters[0], thread_local_hist, local_sizes);
|
||||
|
||||
hpx::wait_all(futures);
|
||||
|
||||
this_fut.wait();
|
||||
|
||||
for(auto & local_hist : thread_local_hist) {
|
||||
for(auto j = 0; j < int(local_hist.size()); ++j) {
|
||||
hist_cur[j] += local_hist[j];
|
||||
}
|
||||
}
|
||||
|
||||
new_size = std::reduce(local_sizes.begin(), local_sizes.end(), new_size, std::plus<std::size_t>{});
|
||||
futures.clear();
|
||||
#else
|
||||
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
|
||||
std::array<int64_t, 1 << 4> local_hist = {};
|
||||
size_t local_size = 0;
|
||||
|
@ -8727,12 +8890,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
||||
}
|
||||
};
|
||||
|
||||
for (int it = 0; it < nthread_use - 1; ++it) {
|
||||
workers.emplace_back(compute);
|
||||
}
|
||||
compute();
|
||||
for (auto & w : workers) { w.join(); }
|
||||
workers.clear();
|
||||
#endif
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
|
@ -9160,6 +9325,7 @@ void llama_backend_init(bool numa) {
|
|||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
|
||||
}
|
||||
|
||||
if (numa) {
|
||||
|
@ -9169,12 +9335,27 @@ void llama_backend_init(bool numa) {
|
|||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_backend_init();
|
||||
#endif
|
||||
#ifdef GGML_USE_HPX
|
||||
{
|
||||
const auto nthread = std::thread::hardware_concurrency();
|
||||
std::string thread_arg = "--hpx:threads=" + std::to_string(nthread);
|
||||
hpx::init_params params;
|
||||
params.cfg = { thread_arg };
|
||||
hpx::start(nullptr, 0, nullptr, params);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Release backend-wide resources. Each section is compiled in only when the
// corresponding backend macro is defined.
void llama_backend_free(void) {
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_backend_free();
|
||||
#endif
|
||||
#ifdef GGML_USE_HPX
|
||||
{
|
||||
// schedule hpx::finalize() as a task on the HPX runtime, then call hpx::stop()
// NOTE(review): presumably hpx::stop() blocks until the runtime started in
// llama_backend_init has shut down - confirm against the HPX documentation
hpx::post([]() { hpx::finalize(); });
|
||||
hpx::stop();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int64_t llama_time_us(void) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue