Arm AArch64: add multithreaded quantization support for the new types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8

2024-06-19 06:15:28 +00:00 · 2024-06-19 06:15:28 +00:00 · cce236bc47
commit cce236bc47
parent a7055b7be5
1 changed files with 7 additions and 3 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -18395,9 +18395,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                f32_data = (float *) f32_conv_buf.data();
            }

+            int chunk_size_multiplier = 1;
            if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-                if ((nelements / tensor->ne[0]) % 4 != 0) new_type = GGML_TYPE_Q4_0;
-                if (nthread > 1) nthread = 1;
+                if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
+                else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
+                if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
+                else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
            }

            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
@ -18412,7 +18415,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            const int64_t nrows = tensor->ne[1];

            static const int64_t min_chunk_size = 32 * 512;
-            const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
+                                       chunk_size_multiplier;

            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;