From 519c981f8b65ee6c87c2965539685ced0a17223b Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 22 Aug 2023 16:03:12 +0200 Subject: [PATCH 1/3] embedding : evaluate prompt in batches (#2713) --- examples/embedding/embedding.cpp | 37 +++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 8788571cb..38395c75b 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -72,23 +72,30 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); } - if (params.embedding){ - if (embd_inp.size() > 0) { - if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - } - - const int n_embd = llama_n_embd(ctx); - const auto embeddings = llama_get_embeddings(ctx); - - for (int i = 0; i < n_embd; i++) { - printf("%f ", embeddings[i]); - } - printf("\n"); + if (embd_inp.size() > (size_t)params.n_ctx) { + fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n", + __func__, embd_inp.size(), params.n_ctx); + return 1; } + while (!embd_inp.empty()) { + int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); + if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + n_past += n_tokens; + embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens); + } + + const int n_embd = llama_n_embd(ctx); + const auto embeddings = llama_get_embeddings(ctx); + + for (int i = 0; i < n_embd; i++) { + printf("%f ", embeddings[i]); + } + printf("\n"); + llama_print_timings(ctx); llama_free(ctx); llama_free_model(model); From bac66994cf356cf488078c056831396eb4ce31d5 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Tue, 22 Aug 2023 19:14:09 +0300 Subject: [PATCH 2/3] Quantization imrovements for k_quants (#2707) * Improve LLaMA-2 2-, 3- and 4-bit quantization * Q3_K_S: use Q5_K for 1st 2 layers of attention.wv and feed_forward.w2 * Q4_K_S: use Q6_K for 1st 2 layers of attention.wv and feed_forward.w2 * Q2_K and Q3_K_M: use Q5_K instead of Q4_K for 1st 2 layers of attention.wv and feed_forward.w2 This leads to a slight model sized increase as follows: Q2_K : 2.684G vs 2.670G Q3_K_S: 2.775G vs 2.745G Q3_K_M: 3.071G vs 3.057G Q4_K_S: 3.592G vs 3.563G LLaMA-2 PPL for context 512 changes as follows: Q2_K : 6.6691 vs 6.8201 Q3_K_S: 6.2129 vs 6.2584 Q3_K_M: 6.0387 vs 6.1371 Q4_K_S: 5.9138 vs 6.0041 There are improvements for LLaMA-1 as well, but they are way smaller than the above. * Minor 4-bit quantization improvement For the same model size as previus commit, we get PPL = 5.9069 vs 5.9138. * Some more fine tuning * Adding make_qkx2_quants With it, we get PPL = 5.8828 for L2-7B Q4_K_S. * Another minor improvement * Q2_K improvement Smaller model, lower perplexity. 7B: file size = 2.632G, PPL = 6.3772 vs original 2.670G PPL = 6.8201 12B: file size = 5.056G, PPL = 5.4577 vs original 5.130G PPL = 5.7178 It is mostly Q3_K except for tok_embeddings, attention.wq, attention.wk, which are Q2_K * Iterating * Revert Q5_K back to make_qkx1_quants * Better Q6_K * make_qkx2_quants is better for Q5_K after all * Fix after rebasing on master * Fix for changed tensor names --------- Co-authored-by: Iwan Kawrakow --- k_quants.c | 164 +++++++++++++++++++++++++++++++++++------------------ llama.cpp | 24 ++++++-- 2 files changed, 130 insertions(+), 58 deletions(-) diff --git a/k_quants.c b/k_quants.c index 6348fce6b..82bf81697 100644 --- a/k_quants.c +++ b/k_quants.c @@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * } return 1/iscale; } + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } int weight_type = rmse_type%2; float sumlx = 0; float suml2 = 0; @@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * suml2 += w*l*l; } float scale = sumlx/suml2; + if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; float best = scale * sumlx; - for (int itry = 0; itry < 3; ++itry) { - iscale = 1/scale; - float slx = 0; - float sl2 = 0; - bool changed = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - if (l + nmax != L[i]) { changed = true; } - float w = weight_type == 1 ? x[i] * x[i] : 1.f; - slx += w*x[i]*l; - sl2 += w*l*l; - } - if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; } - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - sumlx = slx; suml2 = sl2; - scale = sumlx/suml2; - best = scale * sumlx; - } - for (int itry = 0; itry < 5; ++itry) { - int n_changed = 0; - for (int i = 0; i < n; ++i) { - float w = weight_type == 1 ? x[i]*x[i] : 1; - int l = L[i] - nmax; - float slx = sumlx - w*x[i]*l; - if (slx > 0) { - float sl2 = suml2 - w*l*l; - int new_l = nearest_int(x[i] * sl2 / slx); - new_l = MAX(-nmax, MIN(nmax-1, new_l)); - if (new_l != l) { - slx += w*x[i]*new_l; - sl2 += w*new_l*new_l; - if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { - L[i] = nmax + new_l; sumlx = slx; suml2 = sl2; - scale = sumlx / suml2; best = scale * sumlx; - ++n_changed; - } - } - } - } - if (!n_changed) { break; } - } - if (rmse_type < 3) { - return scale; - } - for (int is = -4; is <= 4; ++is) { + for (int is = -9; is <= 9; ++is) { if (is == 0) { continue; } @@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) { +static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, + int ntry, float alpha) { float min = x[0]; float max = x[0]; + float sum_x = 0; + float sum_x2 = 0; for (int i = 1; i < n; ++i) { if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; + sum_x += x[i]; + sum_x2 += x[i]*x[i]; } if (max == min) { for (int i = 0; i < n; ++i) L[i] = 0; @@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t for (int i = 0; i < n; ++i) { sum += x[i] - scale*L[i]; } - min = sum/n; + min = alpha*min + (1 - alpha)*sum/n; if (min > 0) min = 0; iscale = 1/scale; if (!did_change) break; @@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } +static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights[0]; + float sum_x = sum_w * x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + float w = weights[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) min = 0; + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mad = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + float w = weights[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { @@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; + uint8_t Laux[16]; + float weights[16]; float mins[QK_K/16]; float scales[QK_K/16]; @@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5); + for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; + uint8_t Laux[32]; + float weights[32]; float mins[QK_K/32]; float scales[QK_K/32]; @@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5); + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5); + //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; diff --git a/llama.cpp b/llama.cpp index 8b151dc84..0584749c5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3547,24 +3547,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = GGML_TYPE_Q6_K; } } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; ++i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } + else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + } + // This can be used to reduce the size of the Q5_K_S model. + // The associated PPL increase is fully in line with the size reduction + //else { + // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; + //} bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { From deb7dfca4b9725cd295d1426db75fe8e0a6d5312 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Aug 2023 20:05:59 +0300 Subject: [PATCH 3/3] gguf : add ftype meta info to the model (#2710) * llama : add ftype meta info to the model ggml-ci * convert.py : add ftype when converting (does not work) * convert.py : fix Enum to IntEnum ggml-ci --- convert.py | 29 +++++++++++++++++++++++------ gguf.py | 4 ++++ llama.cpp | 21 ++++++++++++++++++--- llama.h | 2 ++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/convert.py b/convert.py index c29c032cd..71978d671 100644 --- a/convert.py +++ b/convert.py @@ -69,7 +69,10 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = { 'I32': DT_I32, } -class GGMLFileType(enum.Enum): +# TODO: match this with `llama_ftype` +# TODO: rename to LLAMAFileType +# TODO: move to `gguf.py` +class GGMLFileType(enum.IntEnum): AllF32 = 0 MostlyF16 = 1 # except 1d tensors @@ -101,6 +104,8 @@ class Params: n_head_kv: int f_norm_eps: float + ftype: Optional[GGMLFileType] = None + @staticmethod def find_n_mult(n_ff: int, n_embd: int) -> int: # hardcoded magic range @@ -738,6 +743,9 @@ class OutputFile: self.gguf.add_head_count_kv (params.n_head_kv) self.gguf.add_layer_norm_rms_eps (params.f_norm_eps) + if params.ftype: + self.gguf.add_file_type(params.ftype) + def add_meta_vocab(self, vocab: Vocab) -> None: tokens = [] scores = [] @@ -1020,6 +1028,12 @@ def main(args_in: Optional[List[str]] = None) -> None: " - LLaMA v2: --ctx 4096\n") params.n_ctx = args.ctx + if args.outtype: + params.ftype = { + "f32": GGMLFileType.AllF32, + "f16": GGMLFileType.MostlyF16, + }[args.outtype] + print(f"params = {params}") vocab: Vocab @@ -1040,11 +1054,14 @@ def main(args_in: Optional[List[str]] = None) -> None: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir, args.vocabtype) - model = model_plus.model - model = convert_model_names(model, params) - output_type = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, output_type) - outfile = args.outfile or default_outfile(model_plus.paths, output_type) + model = model_plus.model + model = convert_model_names(model, params) + ftype = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, ftype) + outfile = args.outfile or default_outfile(model_plus.paths, ftype) + + params.ftype = ftype + print(f"Writing {outfile}, format {ftype}") OutputFile.write_all(outfile, params, model, vocab) print(f"Wrote {outfile}") diff --git a/gguf.py b/gguf.py index 9776649c7..465746718 100644 --- a/gguf.py +++ b/gguf.py @@ -26,6 +26,7 @@ KEY_GENERAL_DESCRIPTION = "general.description" KEY_GENERAL_LICENSE = "general.license" KEY_GENERAL_SOURCE_URL = "general.source.url" KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" +KEY_GENERAL_FILE_TYPE = "general.file_type" # LLM KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length" @@ -595,6 +596,9 @@ class GGUFWriter: def add_source_hf_repo(self, repo: str): self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo) + def add_file_type(self, ftype: int): + self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype) + def add_name(self, name: str): self.add_string(KEY_GENERAL_NAME, name) diff --git a/llama.cpp b/llama.cpp index 0584749c5..6abdc44f2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -995,6 +995,16 @@ struct llama_model_loader { } break; } + // this is a way to mark that we have "guessed" the file type + ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); + + { + const int kid = gguf_find_key(ctx_gguf, "general.file_type"); + if (kid >= 0) { + ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); + } + } + for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(ctx_gguf, i); const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); @@ -1197,7 +1207,11 @@ struct llama_model_loader { // load LLaMA models // -const char * llama_model_ftype_name(enum llama_ftype ftype) { +std::string llama_model_ftype_name(enum llama_ftype ftype) { + if (ftype & LLAMA_FTYPE_GUESSED) { + return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; + } + switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; @@ -1426,7 +1440,7 @@ static void llama_model_load_internal( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); - LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype)); + LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9); // general kv @@ -3450,6 +3464,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // copy the KV pairs from the input file gguf_set_kv (ctx_out, model_loader->ctx_gguf); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", ftype); #ifdef GGML_USE_K_QUANTS int n_attention_wv = 0; @@ -4310,7 +4325,7 @@ int llama_model_n_embd(const struct llama_model * model) { } int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) { - return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype)); + return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } int llama_model_quantize( diff --git a/llama.h b/llama.h index aa5b7d69c..7ce478d54 100644 --- a/llama.h +++ b/llama.h @@ -103,6 +103,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + + LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; typedef struct llama_token_data {