From 4bd781cd2572de9ec022178e9973f79cd1c7b278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Tue, 18 Apr 2023 00:57:30 +0200
Subject: [PATCH] q4_0c: quantize support

---
 examples/quantize/quantize.cpp |  1 +
 ggml.c                         | 41 ++++++++++++++++++++++++++++++----
 ggml.h                         |  3 ++-
 llama.cpp                      | 13 +++++++----
 llama.h                        |  1 +
 5 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 198bd5fcb..bc903d209 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,6 +8,7 @@ static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
     {"q4_0",  LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_0c", LLAMA_FTYPE_MOSTLY_Q4_0C},
     {"q4_1",  LLAMA_FTYPE_MOSTLY_Q4_1},
     {"q4_2",  LLAMA_FTYPE_MOSTLY_Q4_2},
     {"q5_0",  LLAMA_FTYPE_MOSTLY_Q5_0},
diff --git a/ggml.c b/ggml.c
index 84dd55100..f481774a4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -774,11 +774,17 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s
 
 #define QK4_0C (4*32)
 #define QK4_0C_MUL (QK4_0C / QK4_0)
-// TODO: nicer description - pseudostruct?
-// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
+#define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))
+// typedef struct {
+//     uint8_t qs[QK4_0C/2][nb];
+//     float   d[nb];
+// } block_q4_0c
 
 #define QK8_0C 32
-// q8_0c : uint8_t qs[n] || float d[n]
+// typedef struct {
+//     uint8_t qs[QK8_0C][nb];
+//     float   d[nb];
+// } block_q8_0c
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
@@ -13102,6 +13108,27 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_0*sizeof(block_q4_0));
 }
 
+size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK4_0C == 0);
+    const int nb = k / QK4_0;
+
+    for (int j = 0; j < n; j += k) {
+        uint8_t * restrict y = (uint8_t *)dst + sizeof(block_q4_0)*j/QK4_0;
+
+        quantize_row_q4_0c_reference(src + j, y, k);
+
+        for (int i = 0; i < nb*QK4_0/2; i++) {
+            const uint8_t vi0 = y[i] & 0xF;
+            const uint8_t vi1 = y[i] >> 4;
+
+            hist[vi0]++;
+            hist[vi1]++;
+        }
+    }
+
+    return (n/QK4_0*sizeof(block_q4_0));
+}
+
 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
@@ -13229,7 +13256,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist) {
     size_t result = 0;
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -13238,6 +13265,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
                 result = ggml_quantize_q4_0(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_Q4_0C:
+            {
+                GGML_ASSERT(start % QK4_0C == 0);
+                uint8_t * dst_off = (uint8_t *) dst + Q4_0C_QSIZE * start / QK4_0C;
+                result = ggml_quantize_q4_0c(src + start, dst_off, n, k, hist);
+            } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
diff --git a/ggml.h b/ggml.h
index 3c1807736..2b502b2fb 100644
--- a/ggml.h
+++ b/ggml.h
@@ -871,13 +871,14 @@ extern "C" {
     //
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist);
 
     //
     // system info
diff --git a/llama.cpp b/llama.cpp
index 868a58a8b..59747a16c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -481,6 +481,7 @@ struct llama_file_loader {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
@@ -557,6 +558,7 @@ struct llama_file_saver {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
@@ -846,6 +848,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
@@ -1880,6 +1883,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
@@ -1961,15 +1965,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
 
             std::vector<int64_t> hist_cur(1 << 4, 0);
-            int chunk_size = 32 * 512;
+            int row_size = tensor.ne.at(0);
+            int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, row_size, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size, row_size] () {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
@@ -1985,7 +1990,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
                         if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
                     }
                 };
                 if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
diff --git a/llama.h b/llama.h
index 2f6ce8d83..94c3e56b1 100644
--- a/llama.h
+++ b/llama.h
@@ -83,6 +83,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0C = 20, // except 1d tensors
     };

    LLAMA_API struct llama_context_params llama_context_default_params();
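
Note on the arithmetic used in the patch (not part of the patch itself): Q4_0C_QSIZE works out to 80 bytes per 128 values (64 packed nibbles plus four float scales), i.e. the same 20 bytes per 32 values as q4_0, and the new chunk_size rounds the nominal 32*512-element chunk up to a whole number of rows so that each chunk handed to ggml_quantize_chunk covers complete rows (the q4_0c layout stores a row's nibbles followed by its scales, per the pseudostruct comment). The standalone C sketch below illustrates both calculations; the helper names q4_0c_row_bytes and aligned_chunk_size are invented for illustration only and do not exist in the codebase.

// sketch.c - build with: cc sketch.c -lm
#include <assert.h>
#include <math.h>
#include <stdio.h>

#define QK4_0       32
#define QK4_0C      (4*32)
#define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))  // 64 nibble bytes + 4 scales = 80 bytes

// Bytes needed for one row of k values in the q4_0c layout:
// k/2 packed nibbles followed by k/QK4_0 float scales.
static size_t q4_0c_row_bytes(int k) {
    assert(k % QK4_0C == 0);
    return (size_t)k/2 + (size_t)(k/QK4_0)*sizeof(float);
}

// Round the nominal 32*512-element chunk up to a whole number of rows,
// mirroring `ceil(32 * 512 * 1.0 / row_size) * row_size` in the patch.
static int aligned_chunk_size(int row_size) {
    return (int)(ceil(32 * 512 * 1.0 / row_size) * row_size);
}

int main(void) {
    int row_size = 4096;  // hypothetical tensor row width
    printf("bytes per 128-value block: %zu\n", (size_t)Q4_0C_QSIZE);            // 80
    printf("bytes per row            : %zu\n", q4_0c_row_bytes(row_size));      // 2560
    printf("chunk size               : %d elements\n", aligned_chunk_size(row_size)); // 16384
    return 0;
}

For a 4096-wide row this prints a 16384-element chunk (four whole rows), which is why row_size is also threaded through as the new `k` argument of ggml_quantize_chunk above.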