diff --git a/ggml.c b/ggml.c
index ad1356505..42b9c1c6e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11875,19 +11875,19 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
     switch (type) {
         case GGML_TYPE_Q4_0:
             {
-                assert (start % QK4_0 == 0);
+                GGML_ASSERT(start % QK4_0 == 0);
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
                 result = ggml_quantize_q4_0(src + start, block, n, n, hist);
             } break;
         case GGML_TYPE_Q4_1:
             {
-                assert (start % QK4_1 == 0);
+                GGML_ASSERT(start % QK4_1 == 0);
                 block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
                 result = ggml_quantize_q4_1(src + start, block, n, n, hist);
             } break;
         case GGML_TYPE_Q4_2:
             {
-                assert (start % QK4_2 == 0);
+                GGML_ASSERT(start % QK4_2 == 0);
                 block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
                 result = ggml_quantize_q4_2(src + start, block, n, n, hist);
             } break;
diff --git a/llama.cpp b/llama.cpp
index f576345cb..f48149209 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1581,7 +1581,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw format("invalid output file type %d\n", ftype);
     };
 
-    if (nthread <= 0) nthread = std::thread::hardware_concurrency();
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, /*vocab_only*/ false));
 
@@ -1647,15 +1649,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            int chunk_size = 32 * 512;
-            int nchunk = (nelements + chunk_size - 1)/chunk_size;
-            int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            const int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
                 new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
@@ -1674,10 +1676,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if (int(workers.size()) < nthread_use-1) workers.resize(nthread_use-1);
-                for (int it=0; it<nthread_use-1; ++it) workers[it] = std::thread(compute);
+                if ((int) workers.size() < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
                 compute();
-                for (int it=0; it<nthread_use-1; ++it) workers[it].join();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }
 
             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);