From b8d69650dc22203b48597921c7a561f5094c4d4d Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Wed, 17 May 2023 23:39:39 +0800
Subject: [PATCH 1/6] Upgrade v1 format to v2 by leveraging quantize

---
 ggml.c    | 28 ++++++++++++++++++++++++++++
 ggml.h    |  1 +
 llama.cpp | 21 ++++++++++++++++++++-
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 77a3d89f7..f4c34f5d1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -813,6 +813,34 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
+void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
+
+    if (type == GGML_TYPE_Q4_0) {
+
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q4_0);
+        block_q4_0 *blk = (block_q4_0 *)data;
+        block_q4_0 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (size_t j = 0; j < qk/4; j++)
+            {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    }
+}
+
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     static const int qk = QK4_0;
diff --git a/ggml.h b/ggml.h
index 51a616c50..787f927cd 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1086,6 +1086,7 @@ extern "C" {
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    GGML_API void quantize_upgrade(enum ggml_type type, void* data, size_t size);
     //
     // system info
     //
diff --git a/llama.cpp b/llama.cpp
index 4cbc8d6b6..fb231d56d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2074,7 +2074,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         size_t new_size;
         llama_buffer work;
 
-        if (!quantize) {
+        if (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1 && quantize) {
+            if (tensor.type == GGML_TYPE_Q4_0 && quantized_type == GGML_TYPE_Q4_0) {
+                // convert
+                new_type = tensor.type;
+                new_data = tensor.data;
+                new_size = tensor.size;
+                quantize_upgrade(new_type, new_data, new_size);
+                printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+            }
+            else if (tensor.type == GGML_TYPE_Q4_1 && quantized_type == GGML_TYPE_Q4_1) {
+                new_type = tensor.type;
+                new_data = tensor.data;
+                new_size = tensor.size;
+                quantize_upgrade(new_type, new_data, new_size);
+                printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+            }
+            else {
+                throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
+            }
+        } else if (!quantize) {
             new_type = tensor.type;
             new_data = tensor.data;
             new_size = tensor.size;
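The nibble shuffle in patch 1 is easier to follow on a concrete block. In the v1 (GGJT v1) layout, byte j of a Q4_0 block's qs packs weight w(2j) in the low nibble and w(2j+1) in the high nibble; in the v2 layout, byte k packs w(k) low and w(k+16) high. A minimal standalone sketch of the same transform with a round-trip check — not part of the series, with the block reduced to its qs array:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* One Q4_0 block holds 32 4-bit weights in qs[16].
     * v1 order: byte j packs w(2j) low, w(2j+1) high.
     * v2 order: byte k packs w(k) low, w(k+16) high.   */
    static void shuffle_q4_block(uint8_t qs[16]) {
        uint8_t out[16];
        for (int j = 0; j < 8; j++) {          /* qk/4 with qk == 32 */
            uint8_t d1 = qs[j];                /* w(2j),    w(2j+1)  */
            uint8_t d2 = qs[8 + j];            /* w(16+2j), w(17+2j) */
            out[2*j]     = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
            out[2*j + 1] = (d1 >> 4)   | (d2 & 0xf0);
        }
        memcpy(qs, out, sizeof(out));
    }

    int main(void) {
        uint8_t w[32], qs[16];
        for (int k = 0; k < 32; k++) w[k] = (uint8_t)((3*k + 5) & 0x0f);  /* arbitrary nibbles */
        for (int j = 0; j < 16; j++) qs[j] = (uint8_t)(w[2*j] | (w[2*j + 1] << 4)); /* v1 pack */
        shuffle_q4_block(qs);
        for (int k = 0; k < 32; k++) {         /* verify the v2 layout */
            uint8_t n = (k < 16) ? (qs[k] & 0x0f) : (uint8_t)(qs[k - 16] >> 4);
            assert(n == w[k]);
        }
        return 0;
    }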
From d521d093808422ec9ccf9b0a258fe5106af78ba0 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Wed, 17 May 2023 23:42:17 +0800
Subject: [PATCH 2/6] Support Q4_1

---
 ggml.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index f4c34f5d1..1d8e5b6a9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -816,15 +816,34 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
 
     if (type == GGML_TYPE_Q4_0) {
-
         int qk = ggml_blck_size(type);
         const size_t nb = size / sizeof(block_q4_0);
         block_q4_0 *blk = (block_q4_0 *)data;
         block_q4_0 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (size_t j = 0; j < qk/4; j++)
-            {
+            for (size_t j = 0; j < qk/4; j++) {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q4_1) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q4_1);
+        block_q4_1 *blk = (block_q4_1 *)data;
+        block_q4_1 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (size_t j = 0; j < qk/4; j++) {
                 // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
                 // new: d0, d_half, d1, d_half1
                 uint8_t d1;
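Going the other way is the inverse permutation; it is not part of this series, but writing it out makes clear that the transform is lossless. A hypothetical "downgrade" sketch that rebuilds the v1 nibble order from a v2 Q4 block:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical inverse of the shuffle in patches 1-2: recovers the v1
     * nibble order from a v2 Q4 block's qs array (16 bytes, 32 weights). */
    static void unshuffle_q4_block(uint8_t qs[16]) {
        uint8_t out[16];
        for (int j = 0; j < 8; j++) {
            uint8_t lo = qs[2*j];       /* w(2j)   | w(16+2j) << 4 */
            uint8_t hi = qs[2*j + 1];   /* w(2j+1) | w(17+2j) << 4 */
            out[j]     = (lo & 0x0f) | ((hi & 0x0f) << 4);  /* w(2j),    w(2j+1)  */
            out[8 + j] = (lo >> 4)   | (hi & 0xf0);         /* w(16+2j), w(17+2j) */
        }
        memcpy(qs, out, sizeof(out));
    }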
From 10cbc311e3bc9e9fc80858808439f842eec50a27 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Thu, 18 May 2023 09:49:25 +0800
Subject: [PATCH 3/6] Support more data types

---
 ggml.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 llama.cpp | 15 +++++----------
 2 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/ggml.c b/ggml.c
index 1d8e5b6a9..700c36f24 100644
--- a/ggml.c
+++ b/ggml.c
@@ -822,7 +822,7 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
         block_q4_0 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (size_t j = 0; j < qk/4; j++) {
+            for (int j = 0; j < qk/4; j++) {
                 // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
                 // new: d0, d_half, d1, d_half1
                 uint8_t d1;
@@ -843,7 +843,49 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
         block_q4_1 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (size_t j = 0; j < qk/4; j++) {
+            for (int j = 0; j < qk/4; j++) {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q5_0) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q5_0);
+        block_q5_0 *blk = (block_q5_0 *)data;
+        block_q5_0 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (int j = 0; j < qk/4; j++) {
+                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+                // new: d0, d_half, d1, d_half1
+                uint8_t d1;
+                uint8_t d2;
+
+                d1 = blk[i].qs[0 + j];
+                d2 = blk[i].qs[qk/4 + j];
+
+                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+            }
+            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+        }
+    } else if (type == GGML_TYPE_Q5_1) {
+        int qk = ggml_blck_size(type);
+        const size_t nb = size / sizeof(block_q5_1);
+        block_q5_1 *blk = (block_q5_1 *)data;
+        block_q5_1 new_blk;
+
+        for (size_t i = 0; i < nb ; i++) {
+            for (int j = 0; j < qk/4; j++) {
                 // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
                 // new: d0, d_half, d1, d_half1
                 uint8_t d1;
diff --git a/llama.cpp b/llama.cpp
index fb231d56d..f1a1acbf1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2075,22 +2075,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         llama_buffer work;
 
         if (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1 && quantize) {
-            if (tensor.type == GGML_TYPE_Q4_0 && quantized_type == GGML_TYPE_Q4_0) {
+            if (((tensor.type == GGML_TYPE_Q4_0)
+                || (tensor.type == GGML_TYPE_Q4_1)
+                || (tensor.type == GGML_TYPE_Q5_0)
+                || (tensor.type == GGML_TYPE_Q5_1)) && (quantized_type == tensor.type)) {
                 // convert
                 new_type = tensor.type;
                 new_data = tensor.data;
                 new_size = tensor.size;
                 quantize_upgrade(new_type, new_data, new_size);
                 printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            }
-            else if (tensor.type == GGML_TYPE_Q4_1 && quantized_type == GGML_TYPE_Q4_1) {
-                new_type = tensor.type;
-                new_data = tensor.data;
-                new_size = tensor.size;
-                quantize_upgrade(new_type, new_data, new_size);
-                printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            }
-            else {
+            } else {
                 throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
             }
         } else if (!quantize) {
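A note on the two Q5 branches: in ggml of this vintage the Q5 blocks already store their scales as fp16 and carry a separate qh field holding the fifth bit of each weight. The upgrade re-orders only the packed low nibbles in qs and leaves qh untouched, and because the scales are already half precision these blocks keep their size — which is why patch 4 below marks them "No size diff". For reference, the layouts as declared in ggml.c at the time (sizes per 32 weights; a sketch, not a verbatim copy):

    #include <stdint.h>
    typedef uint16_t ggml_fp16_t;       /* stand-in for ggml's half type */

    typedef struct {
        ggml_fp16_t d;                  /* delta                   2 bytes */
        uint8_t     qh[4];              /* 5th bit of each weight  4 bytes */
        uint8_t     qs[16];             /* low 4 bits, packed     16 bytes */
    } block_q5_0;
    _Static_assert(sizeof(block_q5_0) == 22, "22 bytes per 32 weights");

    typedef struct {
        ggml_fp16_t d;                  /* delta */
        ggml_fp16_t m;                  /* min   */
        uint8_t     qh[4];
        uint8_t     qs[16];
    } block_q5_1;
    _Static_assert(sizeof(block_q5_1) == 24, "24 bytes per 32 weights");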
From 006d5707e8e2f47118c71fc597dec464687c08f4 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sun, 21 May 2023 09:49:25 +0800
Subject: [PATCH 4/6] Support V3 format upgrade

---
 ggml.c    | 136 ++++++++++++++++++++++++++++++++----------------------
 ggml.h    |   2 +-
 llama.cpp |  79 +++++++++++++++++++++----------
 3 files changed, 137 insertions(+), 80 deletions(-)

diff --git a/ggml.c b/ggml.c
index 700c36f24..6cddb3f9e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -813,91 +813,117 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
-void quantize_upgrade(enum ggml_type type, void* data, size_t size) {
+static void quantize_shuffle_block(const uint8_t* src, uint8_t* dest, int half_size)
+{
+    for (int j = 0; j < half_size; j++) {
+        // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
+        // new: d0, d_half, d1, d_half1
+        uint8_t d1;
+        uint8_t d2;
+
+        d1 = src[0 + j];
+        d2 = src[half_size + j];
+
+        dest[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
+        dest[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
+    }
+}
+
+typedef struct {
+    float d;                // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0_old;
+typedef struct {
+    float d;                // delta
+    float m;                // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1_old;
+typedef struct {
+    float d;                // delta
+    int8_t qs[QK8_0];       // quants
+} block_q8_0_old;
+
+void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuffle) {
     if (type == GGML_TYPE_Q4_0) {
         int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q4_0);
-        block_q4_0 *blk = (block_q4_0 *)data;
-        block_q4_0 new_blk;
+        const size_t nb = *size / sizeof(block_q4_0_old);
+        block_q4_0_old *blk = (block_q4_0_old *)data;
+        block_q4_0 *new_blk = (block_q4_0 *)data;
+        block_q4_0 new_blk_buf;
+
+        *size = nb * sizeof(block_q4_0);
 
         for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
-            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+            new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
+
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
+            else
+                memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
+
+            memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_0));
         }
     } else if (type == GGML_TYPE_Q4_1) {
         int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q4_1);
-        block_q4_1 *blk = (block_q4_1 *)data;
-        block_q4_1 new_blk;
+        const size_t nb = *size / sizeof(block_q4_1_old);
+        block_q4_1_old *blk = (block_q4_1_old *)data;
+        block_q4_1 *new_blk = (block_q4_1 *)data;
+        block_q4_1 new_blk_buf;
+
+        *size = nb * sizeof(block_q4_1);
 
         for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
-            memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
+            new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
+            new_blk_buf.m = GGML_FP32_TO_FP16(blk[i].m);
+
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
+            else
+                memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
+            memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_1));
        }
    } else if (type == GGML_TYPE_Q5_0) {
+        // No size diff
         int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q5_0);
+        const size_t nb = *size / sizeof(block_q5_0);
         block_q5_0 *blk = (block_q5_0 *)data;
         block_q5_0 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
+            else
+                memcpy(new_blk.qs, blk[i].qs, qk / 2);
             memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
         }
     } else if (type == GGML_TYPE_Q5_1) {
+        // No size diff
         int qk = ggml_blck_size(type);
-        const size_t nb = size / sizeof(block_q5_1);
+        const size_t nb = *size / sizeof(block_q5_1);
         block_q5_1 *blk = (block_q5_1 *)data;
         block_q5_1 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            for (int j = 0; j < qk/4; j++) {
-                // old: d0, d1, d2, d3, d4, ....... d_half, d_half1
-                // new: d0, d_half, d1, d_half1
-                uint8_t d1;
-                uint8_t d2;
-
-                d1 = blk[i].qs[0 + j];
-                d2 = blk[i].qs[qk/4 + j];
-
-                new_blk.qs[0 + j * 2] = (d1 & 0x0f) | ((d2 & 0x0f) << 4);
-                new_blk.qs[1 + j * 2] = (d1 >> 4) | (d2 & 0xf0);
-            }
+            if (shuffle)
+                quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
+            else
+                memcpy(new_blk.qs, blk[i].qs, qk / 2);
             memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
         }
+    } else if (type == GGML_TYPE_Q8_0) {
+        // no shuffle
+        int qk = ggml_blck_size(type);
+        const size_t nb = *size / sizeof(block_q8_0_old);
+        block_q8_0_old *blk = (block_q8_0_old *)data;
+        block_q8_0 *new_blk = (block_q8_0 *)data;
+        block_q8_0 new_blk_buf;
+
+        *size = nb * sizeof(block_q8_0);
+
+        for (size_t i = 0; i < nb ; i++) {
+            new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
+
+            memcpy(new_blk_buf.qs, blk[i].qs, qk);
+            memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q8_0));
+        }
     }
 }
diff --git a/ggml.h b/ggml.h
index 787f927cd..2ce9f51d1 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1086,7 +1086,7 @@ extern "C" {
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
-    GGML_API void quantize_upgrade(enum ggml_type type, void* data, size_t size);
+    GGML_API void quantize_upgrade(enum ggml_type type, void* data, size_t *size, bool needShuffle);
     //
     // system info
     //
diff --git a/llama.cpp b/llama.cpp
index f1a1acbf1..bc1e02017 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -271,6 +271,14 @@ struct llama_context {
     }
 };
 
+enum llama_file_version {
+    LLAMA_FILE_VERSION_GGML,
+    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
+    LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
+};
+
 template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
@@ -305,6 +313,28 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
     return size / ggml_blck_size(type);
 }
 
+static size_t llama_calc_tensor_size_prev3(const std::vector<uint32_t> & ne, enum ggml_type type) {
+    size_t size = ggml_type_size(type);
+
+    switch (type)
+    {
+    case GGML_TYPE_Q4_0:
+        size += 2;
+        break;
+    case GGML_TYPE_Q4_1:
+        size += 4;
+        break;
+    case GGML_TYPE_Q8_0:
+        size += 2;
+        break;
+    }
+
+    for (uint32_t dim : ne) {
+        size = checked_mul<size_t>(size, dim);
+    }
+    return size / ggml_blck_size(type);
+}
+
 struct llama_load_tensor_shard {
     std::vector<uint32_t> ne;
     size_t size;

     size_t file_idx;
     size_t file_off;
 
-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
+    void calc_size(llama_file_version file_version) {
+        if (file_version == LLAMA_FILE_VERSION_GGJT_V3)
+            size = llama_calc_tensor_size(ne, type);
+        else
+            size = llama_calc_tensor_size_prev3(ne, type);
     }
 };

     llama_load_tensor(const std::string & name) : name(name) {}
 
-    void calc_all() {
+    void calc_all(llama_file_version file_version) {
         calc_type();
         calc_split_type();
         calc_ne();
-        calc_size();
+        calc_size(file_version);
     }
 
     void calc_type() {

-    void calc_size() {
-        size = llama_calc_tensor_size(ne, type);
+    void calc_size(llama_file_version file_version) {
+        if (file_version == LLAMA_FILE_VERSION_GGJT_V3)
+            size = llama_calc_tensor_size(ne, type);
+        else
+            size = llama_calc_tensor_size_prev3(ne, type);
     }
 };
@@ -403,14 +439,6 @@ struct llama_load_tensors_map {
     std::unordered_map<std::string, size_t> name_to_idx;
 };
 
-enum llama_file_version {
-    LLAMA_FILE_VERSION_GGML,
-    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
-    LLAMA_FILE_VERSION_GGJT_V1, // added padding
-    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
-    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-};
-
 struct llama_file_loader {
     llama_file file;
     llama_file_version file_version;
@@ -513,7 +541,7 @@ struct llama_file_loader {
         shard.file_idx = file_idx;
         shard.file_off = file.tell();
 
-        shard.calc_size();
+        shard.calc_size(file_version);
         file.seek(shard.size, SEEK_CUR);
 
         auto it = tensors_map.name_to_idx.find(name);
@@ -618,7 +646,7 @@ struct llama_model_loader {
         }
         this->use_mmap = use_mmap;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
+            lt.calc_all(first_file->file_version);
        }
    }
@@ -2074,18 +2102,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         size_t new_size;
         llama_buffer work;
 
-        if (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1 && quantize) {
-            if (((tensor.type == GGML_TYPE_Q4_0)
-                || (tensor.type == GGML_TYPE_Q4_1)
-                || (tensor.type == GGML_TYPE_Q5_0)
-                || (tensor.type == GGML_TYPE_Q5_1)) && (quantized_type == tensor.type)) {
+        bool needShuffle = (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1);
+
+        if (model_loader->file_loaders.at(0)->file_version < LLAMA_FILE_VERSION_GGJT_V3 && quantize) {
+            if ((quantized_type == tensor.type) && 
+                (tensor.type == GGML_TYPE_Q4_0 || tensor.type == GGML_TYPE_Q4_1
+                || tensor.type == GGML_TYPE_Q5_0 || tensor.type == GGML_TYPE_Q5_1
+                || tensor.type == GGML_TYPE_Q8_0)) {
                 // convert
                 new_type = tensor.type;
                 new_data = tensor.data;
                 new_size = tensor.size;
-                quantize_upgrade(new_type, new_data, new_size);
-                printf("Upgrade - size = %8.3f MB\n", tensor.size/1024.0/1024.0);
-            } else {
+                quantize_upgrade(new_type, new_data, &new_size, needShuffle);
+                printf("Upgrade - size = %8.3f MB\n", new_size/1024.0/1024.0);
+            }
+            else {
                 throw format("type %s unsupported for quantization format upgrade", ggml_type_name(tensor.type));
             }
         } else if (!quantize) {
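The V3 step shrinks the Q4_0, Q4_1 and Q8_0 blocks by storing their scales as fp16, which is exactly what the +2/+4/+2 corrections in llama_calc_tensor_size_prev3 account for, and it is why quantize_upgrade can convert in place: block i is read at the old (larger) stride and written back at the new (smaller) stride, so each write finishes before the next unread old block begins. A sketch of the arithmetic, under the assumption of QK = 32 and no struct padding (type names here are illustrative, not ggml's):

    #include <stdint.h>

    typedef uint16_t ggml_fp16_t;   /* stand-in for ggml's half type */

    /* pre-V3 blocks (fp32 scales) vs V3 blocks (fp16 scales), 32 weights each */
    typedef struct { float       d;    uint8_t qs[16]; } q4_0_v2;   /* 20 bytes */
    typedef struct { ggml_fp16_t d;    uint8_t qs[16]; } q4_0_v3;   /* 18 bytes */
    typedef struct { float       d, m; uint8_t qs[16]; } q4_1_v2;   /* 24 bytes */
    typedef struct { ggml_fp16_t d, m; uint8_t qs[16]; } q4_1_v3;   /* 20 bytes */
    typedef struct { float       d;    int8_t  qs[32]; } q8_0_v2;   /* 36 bytes */
    typedef struct { ggml_fp16_t d;    int8_t  qs[32]; } q8_0_v3;   /* 34 bytes */

    /* the per-block corrections applied by llama_calc_tensor_size_prev3 */
    _Static_assert(sizeof(q4_0_v2) == sizeof(q4_0_v3) + 2, "Q4_0: size += 2");
    _Static_assert(sizeof(q4_1_v2) == sizeof(q4_1_v3) + 4, "Q4_1: size += 4");
    _Static_assert(sizeof(q8_0_v2) == sizeof(q8_0_v3) + 2, "Q8_0: size += 2");

    /* In-place conversion is safe front-to-back: block i is read at offset
     * i * sizeof(old) and written at offset i * sizeof(new); the new block
     * being smaller, the write never overtakes the read position. */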
From 80f1faac87c267547328bc7c926f9dd4f2610226 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sun, 21 May 2023 22:31:19 +0800
Subject: [PATCH 5/6] format fix

---
 ggml.c    | 20 ++++++++++++--------
 llama.cpp | 12 ++++++++----
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/ggml.c b/ggml.c
index 6cddb3f9e..1223afced 100644
--- a/ggml.c
+++ b/ggml.c
@@ -856,10 +856,11 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuff
 
             new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
 
-            if (shuffle)
+            if (shuffle) {
                 quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
-            else
+            } else {
                 memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
+            }
 
             memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_0));
         }
@@ -875,10 +876,11 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuff
             new_blk_buf.d = GGML_FP32_TO_FP16(blk[i].d);
             new_blk_buf.m = GGML_FP32_TO_FP16(blk[i].m);
 
-            if (shuffle)
+            if (shuffle) {
                 quantize_shuffle_block(blk[i].qs, new_blk_buf.qs, qk/4);
-            else
+            } else {
                 memcpy(new_blk_buf.qs, blk[i].qs, qk / 2);
+            }
             memcpy(&new_blk[i], &new_blk_buf, sizeof(block_q4_1));
         }
     } else if (type == GGML_TYPE_Q5_0) {
@@ -889,10 +891,11 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuff
         block_q5_0 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            if (shuffle)
+            if (shuffle) {
                 quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
-            else
+            } else {
                 memcpy(new_blk.qs, blk[i].qs, qk / 2);
+            }
             memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
         }
     } else if (type == GGML_TYPE_Q5_1) {
@@ -903,10 +906,11 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuff
         block_q5_1 new_blk;
 
         for (size_t i = 0; i < nb ; i++) {
-            if (shuffle)
+            if (shuffle) {
                 quantize_shuffle_block(blk[i].qs, new_blk.qs, qk/4);
-            else
+            } else {
                 memcpy(new_blk.qs, blk[i].qs, qk / 2);
+            }
             memcpy(blk[i].qs, new_blk.qs, sizeof(new_blk.qs));
         }
     } else if (type == GGML_TYPE_Q8_0) {
diff --git a/llama.cpp b/llama.cpp
index bc1e02017..5f4714be5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -327,6 +327,8 @@ static size_t llama_calc_tensor_size_prev3(const std::vector<uint32_t> & ne, enu
     case GGML_TYPE_Q8_0:
         size += 2;
         break;
+    default:
+        break;
     }
 
     for (uint32_t dim : ne) {
@@ -343,10 +345,11 @@ struct llama_load_tensor_shard {
     size_t file_off;
 
     void calc_size(llama_file_version file_version) {
-        if (file_version == LLAMA_FILE_VERSION_GGJT_V3)
+        if (file_version == LLAMA_FILE_VERSION_GGJT_V3) {
             size = llama_calc_tensor_size(ne, type);
-        else
+        } else {
             size = llama_calc_tensor_size_prev3(ne, type);
+        }
     }
 };
@@ -426,10 +429,11 @@ struct llama_load_tensor {
     }
 
     void calc_size(llama_file_version file_version) {
-        if (file_version == LLAMA_FILE_VERSION_GGJT_V3)
+        if (file_version == LLAMA_FILE_VERSION_GGJT_V3) {
             size = llama_calc_tensor_size(ne, type);
-        else
+        } else {
             size = llama_calc_tensor_size_prev3(ne, type);
+        }
     }
 };
From 2257f9f691395352f57d01f8881003a617cee577 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sun, 21 May 2023 23:03:36 +0800
Subject: [PATCH 6/6] Remove trailing space

---
 ggml.c    | 1 -
 llama.cpp | 6 ++----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 1223afced..4d6389a56 100644
--- a/ggml.c
+++ b/ggml.c
@@ -920,7 +920,6 @@ void quantize_upgrade(enum ggml_type type, void* data, size_t * size, bool shuff
         block_q8_0_old *blk = (block_q8_0_old *)data;
         block_q8_0 *new_blk = (block_q8_0 *)data;
         block_q8_0 new_blk_buf;
-
         *size = nb * sizeof(block_q8_0);
 
         for (size_t i = 0; i < nb ; i++) {
diff --git a/llama.cpp b/llama.cpp
index 5f4714be5..c1892037b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2109,10 +2109,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         bool needShuffle = (model_loader->file_loaders.at(0)->file_version == LLAMA_FILE_VERSION_GGJT_V1);
 
         if (model_loader->file_loaders.at(0)->file_version < LLAMA_FILE_VERSION_GGJT_V3 && quantize) {
-            if ((quantized_type == tensor.type) && 
-                (tensor.type == GGML_TYPE_Q4_0 || tensor.type == GGML_TYPE_Q4_1
-                || tensor.type == GGML_TYPE_Q5_0 || tensor.type == GGML_TYPE_Q5_1
-                || tensor.type == GGML_TYPE_Q8_0)) {
+            if ((quantized_type == tensor.type) &&
+                (tensor.type == GGML_TYPE_Q4_0 || tensor.type == GGML_TYPE_Q4_1 || tensor.type == GGML_TYPE_Q5_0 || tensor.type == GGML_TYPE_Q5_1 || tensor.type == GGML_TYPE_Q8_0)) {
                 // convert
                 new_type = tensor.type;
                 new_data = tensor.data;
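With the series applied, a pre-V3 file is upgraded by "requantizing" it to the type it already has, so every quantized tensor takes the quantize_upgrade path instead of a real requantization from f32/f16. The command below is an assumed example; the exact CLI of the quantize tool depends on the revision:

    # assumed usage: rewrite a GGJT v1/v2 Q4_0 model in the v3 format
    ./quantize ggml-model-q4_0.bin ggml-model-q4_0-v3.bin q4_0

If a quantized tensor is stored in any other type, or the requested type differs from the stored one, llama_model_quantize_internal throws "unsupported for quantization format upgrade", so such files still have to be regenerated from the original weights.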