ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
This commit is contained in:
parent
6986c7835a
commit
2d5db48371
6 changed files with 109 additions and 102 deletions
18
llama.cpp
18
llama.cpp
|
@ -406,6 +406,7 @@ enum llama_file_version {
|
|||
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
||||
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
||||
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
||||
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
|
||||
};
|
||||
|
||||
struct llama_file_loader {
|
||||
|
@ -438,6 +439,8 @@ struct llama_file_loader {
|
|||
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
||||
} else if (magic == 'ggjt' && version == 2) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
||||
} else if (magic == 'ggjt' && version == 3) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V3;
|
||||
} else {
|
||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
|
@ -844,7 +847,8 @@ static const char *llama_file_version_name(llama_file_version version) {
|
|||
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
|
@ -924,11 +928,19 @@ static void llama_model_load_internal(
|
|||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||
}
|
||||
|
||||
if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
|
||||
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
||||
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
|
||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
|
||||
}
|
||||
}
|
||||
|
||||
if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
|
||||
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
|
||||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue