Reviewer comments
parent ce05fc0a67
commit b65e559a68
2 changed files with 13 additions and 11 deletions
ggml.c (6 changes)
@@ -11875,19 +11875,19 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
     switch (type) {
         case GGML_TYPE_Q4_0:
             {
-                assert (start % QK4_0 == 0);
+                GGML_ASSERT(start % QK4_0 == 0);
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
                 result = ggml_quantize_q4_0(src + start, block, n, n, hist);
             } break;
         case GGML_TYPE_Q4_1:
             {
-                assert (start % QK4_1 == 0);
+                GGML_ASSERT(start % QK4_1 == 0);
                 block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
                 result = ggml_quantize_q4_1(src + start, block, n, n, hist);
             } break;
         case GGML_TYPE_Q4_2:
             {
-                assert (start % QK4_2 == 0);
+                GGML_ASSERT(start % QK4_2 == 0);
                 block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
                 result = ggml_quantize_q4_2(src + start, block, n, n, hist);
             } break;
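A note on the change above: GGML_ASSERT differs from the standard assert in that it is not compiled out when NDEBUG is defined, so the alignment check still runs in release builds. The following minimal sketch illustrates the difference; SKETCH_GGML_ASSERT is an assumption modeled on the GGML_ASSERT macro in ggml.h, not quoted from this commit.

    // Sketch: why GGML_ASSERT survives release builds while assert does not.
    #include <cassert>
    #include <cstdio>
    #include <cstdlib>

    #define SKETCH_GGML_ASSERT(x)                                 \
        do {                                                      \
            if (!(x)) {                                           \
                fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n",       \
                        __FILE__, __LINE__, #x);                  \
                abort();                                          \
            }                                                     \
        } while (0)

    int main() {
        const int start = 33, QK4_0 = 32;
        assert(start % QK4_0 == 0);             // removed entirely under -DNDEBUG
        SKETCH_GGML_ASSERT(start % QK4_0 == 0); // always checked; aborts with a message
        return 0;
    }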
llama.cpp (18 changes)
@@ -1581,7 +1581,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw format("invalid output file type %d\n", ftype);
     };

-    if (nthread <= 0) nthread = std::thread::hardware_concurrency();
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }

     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
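Worth noting about the line being braced: std::thread::hardware_concurrency() may return 0 when the core count cannot be determined, in which case nthread stays 0 here and the nthread_use computation in the next hunk falls back to a single thread. A small sketch of that fallback, assuming only what the diff shows:

    #include <algorithm>
    #include <cstdio>
    #include <thread>

    int main() {
        int nthread = 0;  // caller requested "auto"
        if (nthread <= 0) {
            nthread = (int) std::thread::hardware_concurrency();  // may itself be 0
        }
        // Same guard as the next hunk: a 0 or 1 result still yields one thread.
        const int nchunk = 8;
        const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
        printf("nthread = %d, nthread_use = %d\n", nthread, nthread_use);
        return 0;
    }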
@@ -1647,15 +1649,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);

-            int chunk_size = 32 * 512;
-            int nchunk = (nelements + chunk_size - 1)/chunk_size;
-            int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            const int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
                 new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
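Why chunk_size can be dropped from the capture list once it is const: a const int initialized with a constant expression is not odr-used when only its value is read, so a lambda may refer to it without capturing it. nelements, by contrast, is a runtime value and stays in the list. A minimal sketch of that rule, with illustrative names:

    #include <cstdio>

    int main() {
        const int chunk_size = 32 * 512;  // constant-expression initializer
        int nelements = 100000;           // runtime value: must be captured

        auto compute = [nelements]() {    // note: chunk_size absent from the list
            const int nchunk = (nelements + chunk_size - 1) / chunk_size;
            printf("%d chunks of %d elements\n", nchunk, chunk_size);
        };
        compute();
        return 0;
    }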
@@ -1674,10 +1676,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                     }
                 };
-                if (int(workers.size()) < nthread_use-1) workers.resize(nthread_use-1);
-                for (int it=0; it<nthread_use-1; ++it) workers[it] = std::thread(compute);
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
                 compute();
-                for (int it=0; it<nthread_use-1; ++it) workers[it].join();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
             }

             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
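For context on the spawn/join lines above: every worker runs the same compute lambda and claims [first, last) chunks from the shared counter under mutex; the claiming loop itself sits between the second and third hunks and is not shown here. The sketch below reproduces that pattern end to end; the chunk body is a stand-in for ggml_quantize_chunk, and any name not visible in the diff is an assumption:

    #include <algorithm>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const int chunk_size  = 32 * 512;             // same chunking as the diff
        const int nelements   = 10 * chunk_size + 123;
        const int nchunk      = (nelements + chunk_size - 1) / chunk_size;
        const int nthread     = (int) std::thread::hardware_concurrency();
        const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;

        std::mutex mutex;
        size_t counter = 0;   // next unclaimed element index, guarded by mutex
        size_t total   = 0;   // stand-in for new_size, guarded by mutex

        auto compute = [&mutex, &counter, &total, nelements]() {
            size_t local = 0;
            while (true) {
                size_t first;
                {
                    std::lock_guard<std::mutex> lock(mutex);
                    first = counter;          // claim the next chunk
                    counter += chunk_size;    // chunk_size: const, no capture needed
                }
                if (first >= (size_t) nelements) {
                    std::lock_guard<std::mutex> lock(mutex);
                    total += local;           // fold the local result back in, then exit
                    break;
                }
                const size_t last = std::min(first + chunk_size, (size_t) nelements);
                local += last - first;        // stand-in for ggml_quantize_chunk(...)
            }
        };

        std::vector<std::thread> workers(nthread_use - 1);
        for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
        compute();                            // the calling thread participates too
        for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();

        printf("processed %zu of %d elements\n", total, nelements);
        return 0;
    }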