Multi-threading quantization.

Not much gain for simple quantizations, but it will be important
for quantizations that require more CPU cycles.
Iwan Kawrakow 2023-04-19 20:20:44 +02:00
parent f7d05095b4
commit d2f9266200
5 changed files with 79 additions and 22 deletions

examples/quantize/quantize.cpp

@@ -10,8 +10,8 @@
 int main(int argc, char ** argv) {
     ggml_time_init();
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
         fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
         fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);

@@ -29,6 +29,7 @@ int main(int argc, char ** argv) {
     const std::string fname_out = argv[2];
     const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;

     const int64_t t_main_start_us = ggml_time_us();

@@ -38,7 +39,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
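Usage note (not part of the diff): assuming the example binary is built as ./quantize, it can now be invoked as, for instance, ./quantize model-f32.bin model-quant.bin 2 8, where the final argument is the thread count and 2 is the q4_0 type id printed by the usage message. If the thread count is omitted, nthread stays 0 and the library falls back to std::thread::hardware_concurrency() (see the llama.cpp hunk below).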

ggml.c

@@ -11870,6 +11870,33 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist)
     return (n/QK4_2*sizeof(block_q4_2));
 }

+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+    size_t result = 0;
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            {
+                assert (start % QK4_0 == 0);
+                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
+                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_1:
+            {
+                assert (start % QK4_1 == 0);
+                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
+                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q4_2:
+            {
+                assert (start % QK4_2 == 0);
+                block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
+                result = ggml_quantize_q4_2(src + start, block, n, n, hist);
+            } break;
+        default:
+            assert(false);
+    }
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 int ggml_cpu_has_avx(void) {
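For reference, a minimal sketch (not from this commit) of calling the new ggml_quantize_chunk helper directly on a flat float buffer; the buffer length, the oversized dst allocation, and the 16-bucket histogram mirror how llama.cpp uses the function below:

// Minimal sketch (not from this commit): quantize a flat float buffer with one
// call to ggml_quantize_chunk.
#include <cstdio>
#include <cstdint>
#include <vector>
#include "ggml.h"

int main() {
    const int n = 32 * 512;                        // must be a multiple of the Q4_0 block size (32)
    std::vector<float> src(n);
    for (int i = 0; i < n; ++i) src[i] = float(i) / n;

    std::vector<char>    dst(n * sizeof(float));   // generous upper bound on the quantized size
    std::vector<int64_t> hist(1 << 4, 0);          // histogram of the 4-bit quantized values

    // start = 0 and n = full length quantizes everything in a single chunk;
    // the return value is the number of bytes actually written to dst.
    size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q4_0, src.data(), dst.data(), 0, n, hist.data());
    printf("quantized %d floats into %zu bytes\n", n, bytes);
    return 0;
}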

ggml.h

@@ -809,6 +809,8 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

 //
 // system info
 //

llama.cpp

@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -1569,7 +1572,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //

-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;

@@ -1578,6 +1581,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
         default: throw format("invalid output file type %d\n", ftype);
     };

+    if (nthread <= 0) nthread = std::thread::hardware_concurrency();
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);

@@ -1586,6 +1591,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);

+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1639,21 +1647,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
             new_data = work.addr;

             std::vector<int64_t> hist_cur(1 << 4, 0);

-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_2:
-                    {
-                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
-            }
+            int chunk_size = 32 * 512;
+            int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use-1) workers.resize(nthread_use-1);
+                for (int it=0; it<nthread_use-1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it=0; it<nthread_use-1; ++it) workers[it].join();
+            }

             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1775,9 +1799,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
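The chunk scheduling above can be hard to follow inside a diff, so here is a standalone sketch (not from the commit, with a dummy summation standing in for the quantization work) of the same pattern: a mutex-guarded counter hands out fixed-size chunks, each thread accumulates into thread-local state, and the locals are merged under the lock once no work remains. Keeping the per-chunk work (here the inner loop, ggml_quantize_chunk in the real code) outside the lock is what lets the threads run in parallel.

// Standalone sketch (not from the commit) of the work-distribution pattern above.
#include <algorithm>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    const size_t nelements  = 1000000;
    const size_t chunk_size = 32 * 512;
    std::vector<float> data(nelements, 1.0f);

    const int nthread = std::max(1u, std::thread::hardware_concurrency());

    std::mutex mutex;
    size_t counter = 0;   // next unclaimed element index, guarded by mutex
    double total   = 0.0; // merged result, guarded by mutex

    auto compute = [&]() {
        double local = 0.0; // thread-local accumulator, merged under the lock at the end
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            size_t first = counter; counter += chunk_size;
            if (first >= nelements) {
                total += local;          // merge the local result before exiting
                break;
            }
            lock.unlock();               // do the real work without holding the lock
            size_t last = std::min(nelements, first + chunk_size);
            for (size_t i = first; i < last; ++i) local += data[i];
        }
    };

    std::vector<std::thread> workers(nthread - 1);
    for (auto & w : workers) w = std::thread(compute);
    compute();                           // the calling thread participates too
    for (auto & w : workers) w.join();

    printf("sum = %.1f\n", total);       // expected: 1000000.0
    return 0;
}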

llama.h

@@ -92,10 +92,12 @@ extern "C" {
     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);

     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
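Finally, a minimal hypothetical C++ caller of the updated API (not from the commit; the file names are placeholders):

// Minimal sketch of calling the updated C API from C++.
// Passing nthread = 0 lets the library pick std::thread::hardware_concurrency(),
// per the comment added in llama.h above.
#include <cstdio>
#include "llama.h"

int main() {
    const int nthread = 0; // <= 0 means "use all hardware threads"
    if (llama_model_quantize("model-f32.bin", "model-quant.bin", LLAMA_FTYPE_MOSTLY_Q4_0, nthread) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}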