ggml : add Q4_3 quantization (#1082)

This commit is contained in:
Georgi Gerganov 2023-04-20 20:35:53 +03:00 committed by GitHub
parent 6a9661ea5a
commit e0305ead3a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 293 additions and 37 deletions

View file

@ -479,6 +479,7 @@ struct llama_file_loader {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
break;
default: {
throw format("unrecognized tensor type %u\n", shard.type);
@ -552,6 +553,7 @@ struct llama_file_saver {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
break;
default: LLAMA_ASSERT(false);
}
@ -841,6 +843,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16";
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
default: return "unknown, may not work";
}
}
@ -1575,6 +1578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
default: throw format("invalid output file type %d\n", ftype);
};
@ -1652,6 +1656,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
{
new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
} break;
case GGML_TYPE_Q4_3:
{
new_size = ggml_quantize_q4_3(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
} break;
default:
LLAMA_ASSERT(false);
}
@ -1963,7 +1971,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
base_t = dest_t;
}
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
if (ggml_is_quantized(base_t->type)) {
if (!warned) {
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
"use a f16 or f32 base model with --lora-base\n", __func__);