llama : multi-threaded quantization (#1075)

* Multi-threading quantization.

Not much gain for simple quantizations, but it will be important
for quantizations that require more CPU cycles.

* Multi-threading for quantize-stats

It now does the job in ~14 seconds on my Mac for
Q4_0, Q4_1 and Q4_2. Single-threaded, it was taking
more than 2 minutes after adding the more elaborate
version of Q4_2.

* Reviewer comments

* Avoiding compiler confusion

After changing chunk_size to const int as suggested by
@ggerganov, clang and GCC started warning that I don't
need to capture it in the lambda. So, I removed it from the
capture list. But that makes the MSVC build fail. So, I
made it constexpr to keep every compiler happy (see the
sketch below).

* Still fighting with lambda captures in MSVC
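
For the record, here is a minimal standalone sketch of the capture problem
described in the two items above; it is not part of the commit, and
chunk_size is just a stand-in local. With a const int, clang and GCC warn
that an explicit capture is unnecessary (the constant can be read without
being captured), while dropping the capture has been known to break the
MSVC build; the committed code sidesteps both by keeping chunk_size a
plain int and capturing it by value.

#include <cstdio>

int main() {
    // Variant 1: const int + explicit capture -> clang/GCC warn the capture is not required.
    // Variant 2: const int + no capture       -> fine for clang/GCC, but MSVC can reject it.
    // Variant 3 (what the commit ends up with): plain int, captured by value.
    int chunk_size = 32 * 512;
    auto compute = [chunk_size]() { return chunk_size / 512; };
    printf("%d\n", compute());
    return 0;
}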

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Authored by Kawrakow on 2023-04-20 19:42:27 +02:00; committed by GitHub
parent e0305ead3a
commit 38de86a711
6 changed files with 183 additions and 61 deletions

@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1572,7 +1575,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
@@ -1582,6 +1585,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw format("invalid output file type %d\n", ftype);
     };
 
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1590,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1643,25 +1653,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_2:
-                    {
-                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_3:
-                    {
-                        new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
-            }
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+            }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
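
The hunk above is the core of the change: instead of a per-type switch, every
quantization goes through ggml_quantize_chunk, and the tensor is split into
chunks of 32*512 elements that worker threads pull from a mutex-protected
counter, each accumulating a private histogram and byte count that it folds
back into the shared totals before exiting. Below is a self-contained sketch
of that dispatch pattern; fake_quantize_chunk is a placeholder standing in
for ggml_quantize_chunk and the histogram is omitted, so this illustrates the
scheme rather than reproducing the commit's code.

#include <algorithm>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Placeholder for ggml_quantize_chunk: pretends to quantize n elements
// starting at 'first' and returns the number of bytes produced.
static size_t fake_quantize_chunk(const float * src, size_t first, size_t n) {
    (void) src; (void) first;
    return n / 2;
}

int main() {
    const size_t nelements = 1000000;
    std::vector<float> data(nelements, 1.0f);

    const size_t chunk_size = 32 * 512;
    const int nthread = std::max(1u, std::thread::hardware_concurrency());

    std::mutex mutex;
    size_t counter  = 0; // next element index to hand out (guarded by mutex)
    size_t new_size = 0; // total quantized bytes (guarded by mutex)

    auto compute = [&]() {
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            size_t first = counter; counter += chunk_size;
            if (first >= nelements) {
                new_size += local_size; // fold the local result into the shared total
                break;
            }
            lock.unlock();
            size_t last = std::min(nelements, first + chunk_size);
            local_size += fake_quantize_chunk(data.data(), first, last - first);
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(compute);
    compute();                        // the calling thread works too
    for (auto & w : workers) w.join();

    printf("quantized %zu elements into %zu bytes\n", nelements, new_size);
    return 0;
}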
@@ -1783,9 +1805,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
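
With the extra parameter threaded through, callers can now choose the number
of quantization threads, or pass a value <= 0 to fall back to
std::thread::hardware_concurrency(). A hedged usage sketch follows; the model
paths are made up, and it assumes llama.h exposes the four-argument
llama_model_quantize shown above.

#include <cstdio>

#include "llama.h"

int main() {
    // Quantize an f16 model to Q4_0 using 8 threads; nthread <= 0 would
    // instead use std::thread::hardware_concurrency().
    const int ret = llama_model_quantize(
        "models/7B/ggml-model-f16.bin",   // fname_inp (example path)
        "models/7B/ggml-model-q4_0.bin",  // fname_out (example path)
        LLAMA_FTYPE_MOSTLY_Q4_0,          // ftype
        8);                               // nthread
    if (ret != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    printf("done\n");
    return 0;
}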