ggml : add IQ2 to test-backend-ops + refactoring (#4990)
* ggml : add IQ2 to test-backend-ops + refactoring (ggml-ci)
* cuda : update supports_op for IQ2 (ggml-ci)
* ci : enable LLAMA_CUBLAS=1 for CUDA nodes (ggml-ci)
* cuda : fix out-of-bounds access in `mul_mat_vec_q` (ggml-ci)
* tests : avoid creating RNGs for each Q tensor (ggml-ci)
* tests : avoid creating RNGs for each tensor (ggml-ci)
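The last two items concern the test harness: constructing a fresh random engine for every tensor that gets initialized is wasteful, so the generator is created once and reused. A minimal sketch of that pattern, assuming a tensor-fill helper along the lines of the one in test-backend-ops; the names here (rng, fill_uniform) are illustrative, not verbatim from the repo:

// Sketch: hoist RNG construction out of the per-tensor path.
// A function-local static engine is seeded once and reused, instead of
// creating a new std::random_device/engine pair on every call.
#include <random>
#include <vector>

static std::default_random_engine & rng() {
    static std::random_device rd;                 // consulted once
    static std::default_random_engine gen(rd());  // constructed on first use
    return gen;
}

// illustrative stand-in for the tests' tensor initializer
static void fill_uniform(std::vector<float> & data, float lo = -1.0f, float hi = 1.0f) {
    std::uniform_real_distribution<float> dist(lo, hi);
    for (auto & x : data) {
        x = dist(rng());
    }
}

A function-local static is also initialized thread-safely in C++11, which matters if tests fill tensors from multiple threads.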
This commit is contained in:
parent ba69bbc84c
commit 38566680cd
9 changed files with 128 additions and 87 deletions
Showing changes to llama.cpp (12 changed lines):
@@ -8747,8 +8747,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    std::set<ggml_type> used_iq2;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8801,11 +8799,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
-            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-                ggml_init_iq2_quantization(new_type);
-                used_iq2.insert(new_type);
-            }
-
             const float * imatrix = nullptr;
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
@@ -8931,10 +8924,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     fout.close();
 
-    for (auto type : used_iq2) {
-        ggml_deinit_iq2_quantization(type);
-    }
-
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9342,6 +9331,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }
 
 int64_t llama_time_us(void) {
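The diff above is the llama.cpp side of the refactoring: the quantize loop no longer tracks which IQ2 types it initialized via a std::set<ggml_type>, and teardown becomes a single ggml_quantize_free() call in llama_backend_free(). A hypothetical sketch of the bookkeeping that such an init/free pair can absorb; the flag, stub enum, and function names below are assumptions for illustration, not ggml's actual implementation:

// Sketch: centralize lazy one-time setup of IQ2 lookup tables behind an
// init/free pair, so callers don't need their own used-type bookkeeping.
#include <cstdio>

enum class QType { IQ2_XXS, IQ2_XS, Q4_0 };  // stand-in for ggml_type

static bool g_iq2_ready = false;

void quantize_init(QType type) {
    if ((type == QType::IQ2_XXS || type == QType::IQ2_XS) && !g_iq2_ready) {
        std::printf("building IQ2 grid tables once\n");  // placeholder for real setup
        g_iq2_ready = true;
    }
}

void quantize_free() {
    if (g_iq2_ready) {
        std::printf("releasing IQ2 grid tables\n");      // placeholder for real teardown
        g_iq2_ready = false;
    }
}

int main() {
    quantize_init(QType::IQ2_XS);  // first IQ2 use triggers setup
    quantize_init(QType::IQ2_XS);  // repeated calls are no-ops
    quantize_free();               // mirrors ggml_quantize_free() in llama_backend_free()
    return 0;
}

The payoff is visible in the diff: three call sites inside llama_model_quantize_internal collapse to nothing, and cleanup reduces to one unconditional call at backend shutdown.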