ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
This commit is contained in:
Georgi Gerganov 2023-04-25 23:40:51 +03:00 committed by GitHub
parent dd0eabc049
commit 7a32fcb3b2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 312 additions and 147 deletions

View file

@@ -484,6 +484,7 @@ struct llama_file_loader {
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
case GGML_TYPE_Q8_0:
break;
default: {
throw format("unrecognized tensor type %u\n", shard.type);
@@ -558,6 +559,7 @@ struct llama_file_saver {
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
case GGML_TYPE_Q8_0:
break;
default: LLAMA_ASSERT(false);
}
@@ -848,6 +850,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
return "mostly Q4_1, some F16";
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
default: return "unknown, may not work";
}
}
@@ -1585,6 +1588,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
default: throw format("invalid output file type %d\n", ftype);
};